/*	ncbl2_lib.c	functions to read ncbi-blast format files from
			formatdb (blast2.0 format files)

		copyright (c) 1999 William R. Pearson
*/

/* $Name:  $ - $Id: ncbl2_mlib.c 625 2011-03-23 17:21:38Z wrp $ */

/* to turn on mmap()ing for Blast2 files: */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#ifdef UNIX
#include <unistd.h>
#endif
#include <errno.h>


/* ****************************************************************

17-May-2006

Modified to read NCBI .[np]al and .msk files.  The .nal or .pal file
provides a way to read sequences from a list of files.  The .msk file
provides a compact way of indicating the subset of sequences in a
larger database (typically nr or nt) that comprise a smaller database
(e.g. swissprot or pdbaa).  A .pal file (e.g. swissprot.00.pal) that
uses a .msk file has the form:

	# Alias file generated by genmask
	# Date created: Mon Apr 10 11:24:05 2006
	#
	TITLE     Non-redundant SwissProt sequences
	DBLIST    nr.00
	OIDLIST   swissprot.00.msk
	LENGTH    74351250
	NSEQ      198346
	MAXOID    2617347
	MEMB_BIT 1
	# end of the file

To work with this file, we must first load the nr.00 file, and then
read the swissprot.00.msk file, and then scan all the entries in the
swissprot.00.msk file (which are packed 32 mask-bit to an int) to
determine whether a specific libpos index entry is present in the
subset database.

**************************************************************** */


/* ****************************************************************
This code reads NCBI Blast2 format databases from formatdb version 3 and 4

(From NCBI) This section describes the format of the databases.

Formatdb creates three main files for proteins containing indices,
sequences, and headers with the extensions, respectively, of pin, psq,
and phr (for nucleotides these are nin, nsq, and nhr).  A number of
other ISAM indices are created, but these are described elsewhere.

FORMAT OF THE INDEX FILE
------------------------

1.) formatdb version number 	[4 bytes].

2.) protein dump flag (1 for a protein database, 0 for a nucleotide
    database) [4 bytes].

3.) length of the database title in bytes	[4 bytes].
4.) the database title		[length given in 3.)].
5.) length of the date/time string	[4 bytes].
6.) the date/time string	[length given in 5.)].
7.) the number of sequences in the database	[4 bytes].
8.) the total length of the database in residues/basepairs	[4 bytes].
9.) the length of the longest sequence in the database 		[4 bytes].

10.) a list of the offsets for definitions (one for each sequence) in
the header file.  There are num_of_seq+1 of these, where num_of_seq is
the number of sequences given in 7.).

11.) a list of the offsets for sequences (one for each sequence) in
the sequence file.  There are num_of_seq+1 of these, where num_of_seq
is the number of sequences given in 7.).

12.) a list of the offsets for the ambiguity characters (one for each
sequence) in the sequence file.  This list is only present for
nucleotide databases and, since the database is compressed 4/1 for
nucleotides, allows the ambiguity characters to be restored when the
sequence is generated.  There are num_of_seq+1 of these, where
num_of_seq is the number of sequences given in 7.).


FORMAT OF THE SEQUENCE FILE
---------------------------

There are different formats for the protein and nucleotide sequence files.

The protein sequence files is quite simple.  The first byte in the
file is a NULL byte, followed by the sequence in ncbistdaa format
(described in the NCBI Software Development Toolkit documentation).
Following the sequence is another NULL byte, followed by the next
sequence.  The file ends with a NULL byte, following the last
sequence.

The nucleotide sequence file contains the nucleotide sequence, with
four basepairs compressed into one byte.  The format used is NCBI2na,
documented in the NCBI Software Development Toolkit manual.  Any
ambiguity characters present in the original sequence are replaced at
random by A, C, G or T.  The true value of ambiguity characters are
stored at the end of each sequence to allow true reproduction of the
original sequence.

FORMAT OF THE HEADER FILE  (formatdb version 3)
-------------------------

The format of the header file depends on whether or not the identifiers in the
original file were parsed or not.  For the case that they were not, then each
entry has the format:

gnl|BL_ORD_ID|entry_number my favorite yeast sequence...

Here entry_number gives the ordinal number of the sequence in the
database (with zero offset).  The identifier
gnl|BL_ORD_ID|entry_number is used by the BLAST software to identify
the entry, if the user has not provided another identifier.  If the
identifier was parsed, then gnl|BL_ORD_ID|entry_number is replaced by
the correct identifier, as described in
ftp://ncbi.nlm.nih.gov/blast/db/README .

There are no separators between these deflines.

For formatdb version 4, the header file contains blast ASN.1 binary
deflines, which can parsed with parse_fastadl_asn().

FORMAT OF THE .MSK FILE
-----------------------

The .msk file is simply a packed list of masks for formatdb "oids" for
some other file (typically nr).  The first value is the last oid
available; the remainder are packed 32 oids/mask, so that the number
of masks is 1/32 the number of sequences in the file.

**************************************************************** */

#ifdef USE_MMAP
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#ifdef IBM_AIX
#include <fcntl.h>
#else
#include <sys/fcntl.h>
#endif
#endif

#ifdef USE_MMAP
#ifndef MAP_FILE
#define MAP_FILE 0
#endif
#endif

#ifdef UNIX
#define RBSTR "r"
#else
#define RBSTR "rb"
#endif

#ifdef WIN32
#define SLASH_CHAR '\\'
#define SLASH_STR "\\"
#else
#define SLASH_CHAR '/'
#define SLASH_STR "/"
#endif

#define XTERNAL
#include "uascii.h"

#define XTERNAL
#include "upam.h"
#include "ncbl2_head.h"

#include "defs.h"
#include "structs.h"
#include "mm_file.h"

unsigned int bl2_uint4_cvt(unsigned int);
unsigned int bl2_long4_cvt(long);
uint64_t bl2_long8_cvt(uint64_t);
void src_int4_read(FILE *fd,  int *valp);
void src_uint4_read(FILE *fd,  unsigned int *valp);
void src_long4_read(FILE *fd,  long *valp);
void src_long8_read(FILE *fd,  int64_t *val);
void ncbi_long8_read(FILE *fd,  int64_t *valp);
void src_char_read(FILE *fd,  char *valp);
unsigned char *parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
				 int *gi_p, int *db, char *acc,  char *name,
				 char *title, int t_len, int *taxid);

/* nt_btoa maps  from blast 2bit  format to ascii  characters */
static char nt_btoa[5] = {"ACGT"};

static char *aa_b2toa= "-ABCDEFGHIKLMNPQRSTVWXYZU*OJ";

static int aa_btof[32];	/* maps to fasta alphabet */
static int aa_btof_null = 0;

static int dbtype, dbformat, amb_cnt;

#define NCBIBL20 12

struct lmf_str *load_ncbl2(struct lmf_str *m_fptr, FILE *ifile, int dbformat, int dbtype);

int ncbl2_getliba(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
int ncbl2_getlibn(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);

int ncbl2_getliba_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);
int ncbl2_getlibn_o(unsigned char *, int, char *, int, fseek_t *, int *, struct lmf_str *, long *);

void newname(char *, char *, char *, int);
void parse_pal(char *, char *, int *, int *, FILE *);

void ncbl2_ranlib(char *, int, fseek_t, char *, struct lmf_str *m_fd);

/* ncbl2_openlib() is used to open (and memory map) a BLAST2.0 format
   file.  Ifdef USE_MMAP, then ncbl2_openlib returns a structure that can
   be used to read the database. */
   
struct lmf_str *
ncbl2_openlib(struct lib_struct *lib_p,  int ldnaseq)
{
  char lname[256];	/* .pal, .nal file name */
  char dname[256];	/* .pin, .nin file for files included from .msk files */
  char msk_name[256];	/* .msk file name */
  char hname[256];	/* .phr, .nhr */
  char sname[256];	/* .psq, .nsq */
  char tname[256];	/* .pin, .nin file */
  char db_dir[256];	/*  directory where all the files live */
  int pref_db= -1;	/*  right now, only swissprot.pal, pdbaa.pal
			    are used for masked OID files */
  char *bp;
  int oid_seqs, max_oid, have_oid_list;
  int oid_cnt, oid_len;
  unsigned int *oid_list, o_max;
  int tmp;
  int i;
#ifdef USE_MMAP
  struct stat statbuf;
#endif
  FILE *ifile;	/* index offsets, also DB info */
  struct lmf_str *m_fptr;

  /* this function should be reorganized
  (1) check to see if there is a .nal/.pal file before doing things
      with names, since the names might be wrong. 
  (2) if there is a .nal/.pal file, check to see if it has a DBLIST
      (later), if it does, then modify the lib_p->next chain
      if it does not, then generate the appropriate file names, and
      read the oid list
  (3) otherwise (no .nal/.pal file), generate the file names


  */

  if (ldnaseq==SEQT_PROT) {
    newname(lname,lib_p->file_name,AA_LIST_EXT,(int)sizeof(lname));	/* .pal */
  }
  else {
    newname(lname,lib_p->file_name,NT_LIST_EXT,(int)sizeof(lname));	/* .nal */
  }
  
  /* check for a .nal/.pal OID list file */
  max_oid = oid_seqs = 0;
  oid_list = NULL;

  /* here, we check for a .pal/.nal file by trying to open it */
  if ((ifile = fopen(lname,"r"))!=NULL) {

    if ((bp = strrchr(lib_p->file_name,SLASH_CHAR))!=NULL) {
      *bp = '\0';
      strncpy(db_dir,lib_p->file_name,sizeof(db_dir));
      strncat(db_dir,SLASH_STR,sizeof(db_dir)-strlen(db_dir)-1);
      *bp = SLASH_CHAR;
    }
    else {
      db_dir[0]='\0';
    }

    /* we have a list file, we need to parse it */
    parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);
    fclose(ifile);

    pref_db = -1;

    if (oid_seqs > 0) {

      have_oid_list = 1;
      /* get the pref_db before adding the directory */
      if (strncmp(msk_name,"swissprot",9)==0) {
	pref_db = 7;
      }
      else if (strncmp(msk_name,"pdbaa",5)==0) {
	pref_db = 14;
      }

      /* need to add directory to both dname and msk_name */
      strncpy(tname,db_dir,sizeof(tname));
      strncat(tname,msk_name, sizeof(tname));
      strncpy(msk_name, tname, sizeof(msk_name));

      strncpy(tname,db_dir,sizeof(tname));
      strncat(tname,dname, sizeof(tname));
      strncpy(dname,tname,sizeof(dname));

      if (ldnaseq == SEQT_PROT) {
	newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));
	newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));
	newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
      }
      else {	/* reading DNA library */
	newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));
	newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));
	newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
      }
      /* now load the oid file */
      if ((ifile = fopen(msk_name,RBSTR))==NULL) {
	fprintf(stderr,"error - cannot load %s file\n",msk_name);
	return NULL;
      }
      else {
	src_uint4_read(ifile,&o_max);
	if (o_max != max_oid) {
	  fprintf(stderr," error - oid count mismatch %d != %d\n",max_oid, o_max);
	}
	oid_len = (max_oid/32+1);
	if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {
	  fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);
	  return NULL;
	}
	if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {
	  fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);
	  return NULL;
	}
	fclose(ifile);
      }
    }
    else {	/* we had a .msk file, but there are no oid's in
		   it. */
      fprintf(stderr," *** WARNING -- found .pal/.nal file %s with no OIDs\n",lname);
      return NULL;
    }
  }
  else { /* else no OID/.msk file -- generate the names */
    have_oid_list = 0;

    /* initialize file names */
    if (ldnaseq==SEQT_PROT) {	/* read a protein database */
      newname(tname,lib_p->file_name,AA_INDEX_EXT,(int)sizeof(tname));	/* .pin */
      newname(hname,lib_p->file_name,AA_HEADER_EXT,(int)sizeof(hname));	/* .phr */
      newname(sname,lib_p->file_name,AA_SEARCHSEQ_EXT,(int)sizeof(sname));	/* .psq */

    }
    else {	/* reading DNA library */
      newname(tname,lib_p->file_name,NT_INDEX_EXT,(int)sizeof(tname));	/* .nin */
      newname(hname,lib_p->file_name,NT_HEADER_EXT,(int)sizeof(hname));	/* .nhr */
      newname(sname,lib_p->file_name,NT_SEARCHSEQ_EXT,(int)sizeof(sname));	/* .nsq */
    }
  }	


  if (ldnaseq == SEQT_PROT) {
    /* initialize map of BLAST2 amino acids to FASTA amino acids */
    for (i=0; aa_b2toa[i]; i++) {
      if ((tmp=aascii[aa_b2toa[i]])<NA) {
	aa_btof[i]=tmp;
	if (aa_b2toa[i] == 'O' || aa_b2toa[i] == 'o') aa_btof[i] = aascii['K'];
	else if (aa_b2toa[i] == 'U' || aa_b2toa[i] == 'u') aa_btof[i] = aascii['C'];
      }
      else if (aa_b2toa[i]=='*') aa_btof[i]=aascii['X'];
      else aa_btof[i]=0;
/*    else aa_btof[i]=aascii['X']; */
    }

    /* check to see if aa_btof[] actually does anything interesting */
    aa_btof_null = 1;
    for (i=0; i<sizeof(aa_b2toa); i++) {
      if (i != aa_btof[i]) {
#ifdef DEBUG
	fprintf(stderr," difference at: i: %d aa_btof[i]: %d\n",i,aa_btof[i]);
#endif
	aa_btof_null = 0;
      }
    }
  }	/* else no OID/.msk file */


  /* now we have all the file names, open the files and read the data */
  /* open the index/header file, and read the sequence type info  */
  if ((ifile = fopen(tname,RBSTR))==NULL) {
    fprintf(stderr," cannot open %s (%s) INDEX file",tname,lib_p->file_name);
    perror("...");
    return NULL;
  }
  src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */
  src_uint4_read(ifile,(unsigned *)&dbtype);   /* get 1 for protein/0 DNA */

  if (dbformat != FORMATDBV3 && dbformat!=FORMATDBV4) {
    fprintf(stderr,"error - %s wrong formatdb version (%d/%d)\n",
	    tname,dbformat,FORMATDBV3);
    return NULL;
  }

  if ((ldnaseq==SEQT_PROT && dbtype != AAFORMAT) || 
      (ldnaseq==SEQT_DNA && dbtype!=NTFORMAT)) {
    fprintf(stderr,"error - %s wrong format (%d/%d)\n",
	    tname,dbtype,(ldnaseq ? NTFORMAT: AAFORMAT));
    return NULL;
  }

  /* the files are there - allocate lmf_str */
  if ((m_fptr=(struct lmf_str *)calloc(1,sizeof(struct lmf_str)))==NULL) {
    fprintf(stderr," cannot allocate lmf_str\n");
    return NULL;
  }

  m_fptr->tmp_buf_max = 4096;
  if ((m_fptr->tmp_buf=
       (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {
    fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");
    return NULL;
  }

  /* load the oid info */
  m_fptr->have_oid_list = have_oid_list;
  m_fptr->max_oid = max_oid;
  m_fptr->oid_seqs = oid_seqs;
  m_fptr->oid_list = oid_list;
  m_fptr->pref_db= pref_db;

  /* open the header file */
  if ((m_fptr->hfile = fopen(hname,RBSTR))==NULL) {
    fprintf(stderr," cannot open %s header file\n",hname);
    goto error_r;
  }

  /* ncbl2_ranlib is used for all BLAST2.0 access */
  m_fptr->ranlib = ncbl2_ranlib;
  m_fptr->bl_format_ver = dbformat;

  if (ldnaseq==SEQT_DNA) {
    if (oid_seqs > 0) {
      m_fptr->getlib = ncbl2_getlibn_o;
    }
    else {
      m_fptr->getlib = ncbl2_getlibn;
    }
    m_fptr->sascii = nascii;
  }
  else {
    if (oid_seqs > 0) {
      m_fptr->getlib = ncbl2_getliba_o;
    }
    else {
      m_fptr->getlib = ncbl2_getliba;
    }
    m_fptr->sascii = aascii;
  }
  m_fptr->lb_name = lib_p->file_name;

  /* open the sequence file */

#if defined (USE_MMAP) 
  m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);
  if (!m_fptr->mm_flg) {
    fprintf(stderr," cannot open %s",sname);
    perror("...");
  }
  else {
    if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {
      fprintf(stderr," cannot fstat %s",sname);
      perror("...");
      m_fptr->mm_flg = 0;
    }
    else {
      m_fptr->st_size = statbuf.st_size;
      if((m_fptr->mmap_base = 
	  mmap(NULL, m_fptr->st_size, PROT_READ,
	       MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {
	fprintf(stderr," cannot mmap %s",sname);
	perror("...");
	m_fptr->mm_flg = 0;
      }  
      else {
	m_fptr->mmap_addr = m_fptr->mmap_base;
	m_fptr->mm_flg = 1;
      }
    }
    /* regardless, close the open()ed version */
    close(m_fptr->mmap_fd);
  }
#else
  m_fptr->mm_flg = 0;
#endif

  if  (!m_fptr->mm_flg) {
    if ((m_fptr->libf = fopen(sname,RBSTR))==NULL) {
      fprintf(stderr," cannot open %s sequence file",sname);
      perror("...");
      goto error_r;
    }
  }

/* all files should be open -- the rest of the work can be done by a
   function common to ncbl2_openlib() and ncbl2_reopen()
*/

  return load_ncbl2(m_fptr, ifile, dbformat, dbtype);

 error_r:
  /* here if failure after m_fptr allocated */
  free(m_fptr);
  return NULL;
}


/* **************************************************************** */
/* re-open an already opened library file using information in m_fptr

   a valid m_fptr guarantees that the necessary files exist, and we
   know whether there is an OID file.  m_fptr's are NEVER used for
   file lists, only for files with sequence data
*/
/* **************************************************************** */

struct lmf_str *ncbl2_reopen(struct lmf_str *m_fptr) {
  char lname[256];	/* .pal, .nal file name */
  char dname[256];	/* .pin, .nin file for files included from .msk files */
  char msk_name[256];	/* .msk file name */
  char hname[256];	/* .phr, .nhr */
  char sname[256];	/* .psq, .nsq */
  char tname[256];	/* .pin, .nin file */
  char db_dir[256];	/*  directory where all the files live */
  int pref_db= -1;	/*  right now, only swissprot.pal, pdbaa.pal
			    are used for masked OID files */
  char *bp;
  int oid_seqs, max_oid, have_oid_list;
  int oid_cnt, oid_len;
  unsigned int *oid_list, o_max;
#ifdef USE_MMAP
  struct stat statbuf;
#endif
  FILE *ifile;	/* index offsets, also DB info */

  /* if we have an oid list, open it, read it, and use it to get the file names */
  /* check for a .nal/.pal OID list file */
  max_oid = oid_seqs = 0;
  oid_list = NULL;

  if (m_fptr->have_oid_list) {
    if (m_fptr->lib_aa==1) {
      newname(lname,m_fptr->lb_name,AA_LIST_EXT,(int)sizeof(lname));	/* .pal */
    }
    else {
      newname(lname,m_fptr->lb_name,NT_LIST_EXT,(int)sizeof(lname));	/* .nal */
    }
  
    ifile = fopen(lname,"r");	/* it has to open, it did before */
    if ((bp = strrchr(m_fptr->lb_name,SLASH_CHAR))!=NULL) {
      *bp = '\0';
      strncpy(db_dir,m_fptr->lb_name,sizeof(db_dir));
      strncat(db_dir,SLASH_STR,sizeof(db_dir)-strlen(db_dir)-1);
      *bp = SLASH_CHAR;
    }
    else {
      db_dir[0]='\0';
    }

    /* we have a list file, we need to parse it */
    parse_pal(dname, msk_name, &oid_seqs, &max_oid, ifile);
    fclose(ifile);

    /* we have read the .pal/.nal file, now deal with the .msk file */

    pref_db = -1;

    /* get the pref_db before adding the directory */
    if (strncmp(msk_name,"swissprot",9)==0) {
      pref_db = 7;
    }
    else if (strncmp(msk_name,"pdbaa",5)==0) {
      pref_db = 14;
    }

    /* need to add directory to both dname and msk_name */
    strncpy(tname,db_dir,sizeof(tname));
    strncat(tname,msk_name, sizeof(tname));
    strncpy(msk_name, tname, sizeof(msk_name));

    strncpy(tname,db_dir,sizeof(tname));
    strncat(tname,dname, sizeof(tname));
    strncpy(dname,tname,sizeof(dname));

    if (m_fptr->lib_aa) {
      newname(tname,dname,AA_INDEX_EXT,(int)sizeof(tname));
      newname(hname,dname,AA_HEADER_EXT,(int)sizeof(hname));
      newname(sname,dname,AA_SEARCHSEQ_EXT,(int)sizeof(sname));
    }
    else {	/* reading DNA library */
      newname(tname,dname,NT_INDEX_EXT,(int)sizeof(tname));
      newname(hname,dname,NT_HEADER_EXT,(int)sizeof(hname));
      newname(sname,dname,NT_SEARCHSEQ_EXT,(int)sizeof(sname));
    }

    ifile = fopen(msk_name,RBSTR);
    src_uint4_read(ifile,&o_max);
    oid_len = (max_oid/32+1);
    if ((oid_list=(unsigned int *)calloc(oid_len,sizeof(int)))==NULL) {
      fprintf(stderr," error - cannot allocate oid_list[%d]\n",oid_len);
      return NULL;
    }
    if ((oid_cnt=fread(oid_list,sizeof(int),oid_len,ifile))==0) {
      fprintf(stderr," error - cannot read oid_list[%d]\n",oid_len);
      return NULL;
    }
    fclose(ifile);
  }
  else { /* else no OID/.msk file -- generate the names */
    /* initialize file names */
    if (m_fptr->lib_aa) {	/* read a protein database */
      newname(tname,m_fptr->lb_name,AA_INDEX_EXT,(int)sizeof(tname));	/* .pin */
      newname(hname,m_fptr->lb_name,AA_HEADER_EXT,(int)sizeof(hname));	/* .phr */
      newname(sname,m_fptr->lb_name,AA_SEARCHSEQ_EXT,(int)sizeof(sname));	/* .psq */

    }
    else {	/* reading DNA library */
      newname(tname,m_fptr->lb_name,NT_INDEX_EXT,(int)sizeof(tname));	/* .nin */
      newname(hname,m_fptr->lb_name,NT_HEADER_EXT,(int)sizeof(hname));	/* .nhr */
      newname(sname,m_fptr->lb_name,NT_SEARCHSEQ_EXT,(int)sizeof(sname));	/* .nsq */
    }
  }	

  /* now we have all the file names, open the files and read the data */
  /* open the index/header file, and read the sequence type info  */
  if ((ifile = fopen(tname,RBSTR))==NULL) {
    fprintf(stderr," cannot open %s (%s) INDEX file",tname,m_fptr->lb_name);
    perror("...");
    return NULL;
  }
  src_uint4_read(ifile,(unsigned *)&dbformat); /* get format DB version number */
  src_uint4_read(ifile,(unsigned *)&dbtype);   /* get 1 for protein/0 DNA */

  m_fptr->tmp_buf_max = 4096;
  if ((m_fptr->tmp_buf=
       (char *)calloc(m_fptr->tmp_buf_max,sizeof(char)))==NULL) {
    fprintf(stderr," cannot allocate lmf_str->tmp_buffer\n");
    return NULL;
  }

  /* load the oid info */
  m_fptr->max_oid = max_oid;
  m_fptr->oid_seqs = oid_seqs;
  m_fptr->oid_list = oid_list;
  m_fptr->pref_db= pref_db;

  /* open the header file */
  m_fptr->hfile = fopen(hname,RBSTR);

  /* ncbl2_ranlib is used for all BLAST2.0 access */
  m_fptr->ranlib = ncbl2_ranlib;
  m_fptr->bl_format_ver = dbformat;

  if (!m_fptr->lib_aa) {
    if (oid_seqs > 0) {m_fptr->getlib = ncbl2_getlibn_o; }
    else { m_fptr->getlib = ncbl2_getlibn;}
    m_fptr->sascii = nascii;
  }
  else {
    if (oid_seqs > 0) { m_fptr->getlib = ncbl2_getliba_o; }
    else { m_fptr->getlib = ncbl2_getliba; }
    m_fptr->sascii = aascii;
  }

  /* open the sequence file */

#if defined (USE_MMAP) 
  m_fptr->mm_flg=((m_fptr->mmap_fd=open(sname,O_RDONLY))>=0);
  if(fstat(m_fptr->mmap_fd, &statbuf) < 0) {
    fprintf(stderr," cannot fstat %s",sname);
    perror("...");
    m_fptr->mm_flg = 0;
  }
  else {
    m_fptr->st_size = statbuf.st_size;
    if((m_fptr->mmap_base = 
	mmap(NULL, m_fptr->st_size, PROT_READ,
	     MAP_FILE | MAP_SHARED, m_fptr->mmap_fd, 0)) == (char *) -1) {
      fprintf(stderr," cannot mmap %s",sname);
      perror("...");
      m_fptr->mm_flg = 0;
    }  
    else {
      m_fptr->mmap_addr = m_fptr->mmap_base;
      m_fptr->mm_flg = 1;
    }
  }
  /* regardless, close the open()ed version */
  close(m_fptr->mmap_fd);
#else
  m_fptr->mm_flg = 0;
#endif

  if  (!m_fptr->mm_flg) {
    m_fptr->libf = fopen(sname,RBSTR);
  }

  return load_ncbl2(m_fptr, ifile, dbformat, dbtype);
}

struct lmf_str 
*load_ncbl2(struct lmf_str *m_fptr, FILE *ifile, int dbformat, int dbtype) {
  int title_len;
  char *title_str=NULL;
  int date_len;
  char *date_str=NULL;
  long ltmp;
  int64_t l8tmp;
  int i, tmp;
  unsigned int *f_pos_arr;

  src_uint4_read(ifile,(unsigned *)&title_len);

  if (title_len > 0) {
    if ((title_str = calloc((size_t)title_len+1,sizeof(char)))==NULL) {
      fprintf(stderr," cannot allocate title string (%d)\n",title_len);
      goto error_r;
    }
    fread(title_str,(size_t)1,(size_t)title_len,ifile);
  }
  
  src_uint4_read(ifile,(unsigned *)&date_len);

  if (date_len > 0) {
    if ((date_str = calloc((size_t)date_len+1,sizeof(char)))==NULL) {
      fprintf(stderr," cannot allocate date string (%d)\n",date_len);
      goto error_r;
    }
    fread(date_str,(size_t)1,(size_t)date_len,ifile);
  }
  
  m_fptr->lpos = 0;
  src_uint4_read(ifile,(unsigned *)&m_fptr->max_cnt);
  
  if (dbformat == FORMATDBV3) {
    src_long4_read(ifile,&ltmp);
    m_fptr->tot_len = ltmp;
  }
  else {
    ncbi_long8_read(ifile,&l8tmp);
    m_fptr->tot_len = l8tmp;
  }

  src_long4_read(ifile,&ltmp);
  m_fptr->max_len = ltmp;

  /* currently we are not using this information, but perhaps later */
  if (title_str!=NULL) free(title_str);
  if (date_str!=NULL) free(date_str);

#ifdef DEBUG
    fprintf(stderr,"%s format: BL2 (%s)  max_cnt: %d, totlen: %lld, maxlen %ld\n",
	    m_fptr->lb_name,m_fptr->mm_flg ? "mmap" : "fopen", 
	    m_fptr->max_cnt,m_fptr->tot_len,m_fptr->max_len);
#endif

  /* allocate and read hdr indexes */
  if ((f_pos_arr=(unsigned int *)calloc((size_t)m_fptr->max_cnt+1,sizeof(int)))==NULL) {
      fprintf(stderr," cannot allocate tmp header pointers\n");
      goto error_r;
    }

  if ((m_fptr->d_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
      fprintf(stderr," cannot allocate header pointers\n");
      goto error_r;
    }

  /* allocate and read sequence offsets */
  if ((m_fptr->s_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
      fprintf(stderr," cannot allocate sequence pointers\n");
      goto error_r;
    }

  if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
    fprintf(stderr," error reading hdr offsets: %s\n",m_fptr->lb_name);
    goto error_r;
  }

  for (i=0; i<=m_fptr->max_cnt; i++)
#ifdef IS_BIG_ENDIAN
    m_fptr->d_pos_arr[i] = f_pos_arr[i];
#else
    m_fptr->d_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
#endif

  if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
    fprintf(stderr," error reading seq offsets: %s\n",m_fptr->lb_name);
    goto error_r;
  }
  for (i=0; i<=m_fptr->max_cnt; i++) {
#ifdef IS_BIG_ENDIAN
    m_fptr->s_pos_arr[i] = f_pos_arr[i];
#else
    m_fptr->s_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
#endif
  }

  if (dbtype == NTFORMAT) {
    /* allocate and ambiguity  offsets */
    if ((m_fptr->a_pos_arr=(MM_OFF *)calloc((size_t)m_fptr->max_cnt+1,sizeof(MM_OFF)))==NULL) {
      fprintf(stderr," cannot allocate sequence pointers\n");
      goto error_r;
    }

    /*
    for (i=0; i<=m_fptr->max_cnt; i++) src_uint4_read(ifile,&m_fptr->a_pos_arr[i]);
    */

    if (fread(f_pos_arr,(size_t)4,m_fptr->max_cnt+1,ifile)!=m_fptr->max_cnt+1) {
      fprintf(stderr," error reading seq offsets: %s\n",m_fptr->lb_name);
      goto error_r;
    }
    for (i=0; i<=m_fptr->max_cnt; i++) {
#ifdef IS_BIG_ENDIAN
      m_fptr->a_pos_arr[i] = f_pos_arr[i];
#else
      m_fptr->a_pos_arr[i] = bl2_uint4_cvt(f_pos_arr[i]);
#endif
    }
  }

  /*
  for (i=0; i < min(m_fptr->max_cnt,10); i++) {
    fprintf(stderr,"%d: %d %d %d\n",i,m_fptr->s_pos_arr[i],m_fptr->a_pos_arr[i],m_fptr->d_pos_arr[i]);
  }
  */

  /* all done with ifile, close it */
  fclose(ifile);
  free(f_pos_arr);

  if (!m_fptr->mm_flg) {
    tmp = fgetc(m_fptr->libf);
    if (tmp!=NULLB)
      fprintf(stderr," phase error: %d:%d found\n",0,tmp);
  }

  m_fptr->bl_lib_pos = 1;
  amb_cnt = 0;

  return m_fptr;

 error_r:
  /* here if failure after m_fptr allocated */
  free(m_fptr);
  return NULL;
}

/* **************************************************************** */
/* close the library, free s_pos_arr, a_pos_arr, but save file info */
/* **************************************************************** */

void ncbl2_closelib(struct lmf_str *m_fptr)
{
  if (m_fptr->tmp_buf != NULL) {
    free(m_fptr->tmp_buf);
    m_fptr->tmp_buf_max = 0;
  }

  if (m_fptr->s_pos_arr !=NULL) {
    free(m_fptr->s_pos_arr);
    m_fptr->s_pos_arr = NULL;
  }
  if (m_fptr->a_pos_arr!=NULL) {
    free(m_fptr->a_pos_arr);
    m_fptr->a_pos_arr = NULL;
  }

  if (m_fptr->hfile !=NULL ) {
    fclose(m_fptr->hfile); m_fptr->hfile=NULL;
    free(m_fptr->d_pos_arr); m_fptr->d_pos_arr = NULL;
  }

  if (m_fptr->oid_list != NULL) {
    free(m_fptr->oid_list); m_fptr->oid_list = NULL;
    m_fptr->oid_seqs = m_fptr->max_oid = 0;
  }

#ifdef use_mmap
  if (m_fptr->mm_flg) {
    munmap(m_fptr->mmap_base,m_fptr->st_size);
    m_fptr->mmap_fd = -1;
  }
  else 
#endif
    if (m_fptr->libf !=NULL ) {fclose(m_fptr->libf); m_fptr->libf=NULL;}
}

/* **************************************************************** */
/* read a protein sequence using OID offsets */
/* **************************************************************** */

int
ncbl2_getliba_o(unsigned char *seq,
		int maxs,
		char *libstr,
		int n_libstr,
		fseek_t *libpos,
		int *lcont,
		struct lmf_str *m_fd,
		long *l_off)
{
  int tpos;
  unsigned int t_mask, t_shift, oid_mask;
  
  /* get to the next valid pointer */
  
  for ( tpos = m_fd->lpos ;tpos <= m_fd->max_oid; tpos++) {
    t_mask = tpos / 32;
    t_shift = 31 - (tpos % 32);
    if ((oid_mask = m_fd->oid_list[t_mask])==0) {  continue; }

    if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
      if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
      m_fd->lpos = tpos;	/* already bumped up */
      m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
      return ncbl2_getliba(seq, maxs, libstr, n_libstr,
			   libpos, lcont, m_fd, l_off);
    }
  }
  return -1;
}

/* **************************************************************** */
/* read a protein sequence */
/* **************************************************************** */

int
ncbl2_getliba(unsigned char *seq,
	      int maxs,
	      char *libstr,
	      int n_libstr,
	      fseek_t *libpos,
	      int *lcont,
	      struct lmf_str *m_fd,
	      long *l_off)
{
  unsigned char *sptr, *dptr;
  int s_chunk, d_len, lib_cnt;
  long seqcnt;
  long tmp;
  static long seq_len;
#if defined(DEBUG) || defined(PCOMPLIB)
  int gi, my_db, taxid;
  char acc[20], title[21], name[20];
#endif
  
  *l_off = 1;

  lib_cnt = m_fd->lpos;
  *libpos = (fseek_t)m_fd->lpos;

  if (*lcont==0) {
    if (lib_cnt >= m_fd->max_cnt) return -1;	/* no more sequences */
    seq_len = m_fd->s_pos_arr[lib_cnt+1] - m_fd->s_pos_arr[lib_cnt]; /* value is +1 off to get the NULL */
    if (m_fd->mm_flg) m_fd->mmap_addr = m_fd->mmap_base+m_fd->s_pos_arr[lib_cnt];
#if !defined(DEBUG) && !defined(PCOMPLIB)
    libstr[0]='\0';
#else
    /* get the name from the header file */
    fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);

    if (m_fd->bl_format_ver == FORMATDBV3) {
      d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
      fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
      libstr[d_len]='\0';
    }
    else {
      d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
      fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile);
      parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len,
			&gi, &my_db, acc, name, title, 20, &taxid);
      sprintf(libstr,"gi|%d",gi);
    }
#endif
  }
  if (seq_len <= maxs) { /* sequence fits */
    seqcnt = seq_len;
    m_fd->lpos++;
    *lcont = 0;
  }
  else {		/* doesn't fit */
    seqcnt = maxs-1;
    (*lcont)++;
  } 

  if (m_fd->mm_flg) sptr = (unsigned char *)m_fd->mmap_addr;
  else {
    if ((tmp=fread(seq,(size_t)1,(size_t)seq_len,m_fd->libf))!=(size_t)seq_len) {
      fprintf(stderr," could not read sequence record: %lld %ld != %ld\n",
	      *libpos,tmp,seq_len);
      goto error; 
    }
    sptr = seq;
  }
  if (seq_len <= maxs) {seqcnt = --seq_len;}

  /* everything is ready, set up dst. pointer, seq_len */
  dptr = seq;

  if (aa_b2toa[sptr[seq_len-1]]=='*') seq_len--;
  s_chunk = seqcnt/16;
  while (s_chunk-- > 0) {
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
    *dptr++ = aa_btof[*sptr++];
  }
  while (dptr < seq+seqcnt) *dptr++ = aa_btof[*sptr++];

  if (m_fd->mm_flg) m_fd->mmap_addr = (char *)sptr;

  /* we didn't get it all, so reset for more */
  if (*lcont) seq_len -= seqcnt;

  seq[seqcnt]= EOSEQ;
  return (seqcnt);
  
error:	fprintf(stderr," error reading %s at %lld\n",libstr,*libpos);
  fflush(stderr);
  return (-1);
}

/* **************************************************************** */
/* read a DNA sequence using OID offsets */
/* **************************************************************** */

int 
ncbl2_getlibn_o(unsigned char *seq,
		int maxs,
		char *libstr,
		int n_libstr,
		fseek_t *libpos,
		int *lcont,
		struct lmf_str *m_fd,
		long *l_off)
{
  int tpos;
  unsigned int t_mask, t_shift, oid_mask;
  
  /* get to the next valid pointer */
  
  for (tpos = m_fd->lpos; tpos <= m_fd->max_oid; tpos++) {
    t_mask = tpos / 32;
    t_shift = 31 - (tpos % 32);
    if ((oid_mask = m_fd->oid_list[t_mask])==0) {  continue; }

    if ((bl2_uint4_cvt(oid_mask) & 0x1 << t_shift)) {
      if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[tpos],0);
      m_fd->lpos = tpos;	/* already bumped up */
      m_fd->bl_lib_pos = m_fd->s_pos_arr[tpos];
      return ncbl2_getlibn(seq, maxs, libstr, n_libstr,
			   libpos, lcont, m_fd, l_off);
    }
  }
  return -1;
}

static char tmp_amb[4096];

/* **************************************************************** */
/* read a DNA sequence */
/* **************************************************************** */

int
ncbl2_getlibn(unsigned char *seq,
	      int maxs,
	      char *libstr,
	      int n_libstr,
	      fseek_t *libpos,
	      int *lcont,
	      struct lmf_str *m_fd,
	      long *l_off)
{
  unsigned char *sptr, *tptr, stmp;
  long seqcnt;
  int s_chunk, lib_cnt;
  size_t tmp;
  char ch;
  static long seq_len;
  static int c_len,c_pad;
  int c_len_set, d_len;
#if defined(DEBUG) || defined(PCOMPLIB)
  int gi, my_db, taxid;
  char acc[20], title[21], name[20];
#endif

  *l_off = 1;

  lib_cnt = m_fd->lpos;
  *libpos = (fseek_t)lib_cnt;
  if (*lcont==0) {	/* not a continuation of previous */
    if (lib_cnt >= m_fd->max_cnt) return (-1);
    c_len = m_fd->a_pos_arr[lib_cnt]- m_fd->s_pos_arr[lib_cnt];
    if (!m_fd->mm_flg) {
      if (m_fd->bl_lib_pos != m_fd->s_pos_arr[lib_cnt]) { /* are we positioned to read? */
	amb_cnt++;
	if ((m_fd->bl_lib_pos - m_fd->s_pos_arr[lib_cnt]) < sizeof(tmp_amb)) {
	  /* jump over amb_ray */
	  fread(tmp_amb,(size_t)1,(size_t)(m_fd->s_pos_arr[lib_cnt]-m_fd->bl_lib_pos),m_fd->libf);
	}
	else {	/* fseek over amb_ray */
	  fseek(m_fd->libf,m_fd->s_pos_arr[lib_cnt],0);
	}
	m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
      }
    }
    else m_fd->mmap_addr = m_fd->mmap_base + m_fd->s_pos_arr[lib_cnt];
#if !defined(DEBUG) && !defined(PCOMPLIB)
    libstr[0]='\0';
#else
    /* get the name from the header file */
    fseek(m_fd->hfile,m_fd->d_pos_arr[lib_cnt],0);

    if (m_fd->bl_format_ver == FORMATDBV3) {
      d_len = min(n_libstr-1,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
      fread(libstr,(size_t)1,(size_t)d_len,m_fd->hfile);
      libstr[d_len]='\0';
    }
    else {
      d_len = min(m_fd->tmp_buf_max,m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt]-1);
      fread(m_fd->tmp_buf,(size_t)1,(size_t)d_len,m_fd->hfile);
      parse_fastadl_asn((unsigned char *)m_fd->tmp_buf, (unsigned char *)m_fd->tmp_buf+d_len,
			&gi, &my_db, acc, name, title, 20, &taxid);
      sprintf(libstr,"gi|%d",gi);
    }
#endif
  }			/* end of *lcont==0 */

  /* To avoid the situation where c_len <= 1; we must anticipate what
     c_len will be after this pass.  If it will be <= 64, back off this
     time so next time it will be > 64 */

  seq_len = c_len*4;

  if ((seq_len+4 > maxs) && (seq_len+4 - maxs  <= 256)) {
    /* we won't be done but we will have less than 256 to go */
    c_len -= 64; seq_len -= 256; c_len_set = 1; maxs -= 256;}
  else c_len_set = 0;

  /*
  fprintf(stderr," lib_cnt: %d %d %d %d\n",lib_cnt,c_len,seq_len,maxs);
  */

  /* does the rest of the sequence fit? */
  if (seq_len <= maxs-4 && !c_len_set) {
    seqcnt = c_len;
    if (!m_fd->mm_flg) {
      if ((tmp=fread(seq,(size_t)1,(size_t)seqcnt,m_fd->libf))!=(size_t)seqcnt) {
	fprintf(stderr,
		" could not read sequence record: %s %lld %ld != %ld: %d\n",
		libstr,*libpos,tmp,seqcnt,*seq);
	goto error; 
      }
      m_fd->bl_lib_pos += tmp;
      sptr = seq + seqcnt;
    }
    else sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);

    *lcont = 0;		/* this is the last chunk */
    lib_cnt++;		/* increment to the next sequence */
    /* the last byte is either '0' (no remainder) or the last 1-3 chars and the remainder */
    c_pad = *(sptr-1);
    c_pad &= 0x3;	/* get the last (low) 2 bits */
    seq_len -= (4 - c_pad);	/* if the last 2 bits are 0, its a NULL byte */
  }
  else {	/* get the next chunk, but more to come */
    seqcnt = ((maxs+3)/4)-1;
    if (!m_fd->mm_flg) {
      if ((tmp=fread(seq,(size_t)1,(size_t)(seqcnt),m_fd->libf))!=(size_t)(seqcnt)) {
	fprintf(stderr," could not read sequence record: %lld %ld/%ld\n",
		*libpos,tmp,seqcnt);
	goto error;
      }
      m_fd->bl_lib_pos += tmp;
      sptr = seq + seqcnt;
    }
    else {
      sptr = (unsigned char *)(m_fd->mmap_addr+seqcnt);
      m_fd->mmap_addr += seqcnt;
    }
    seq_len = 4*seqcnt;
    c_len -= seqcnt;
    if (c_len_set) {c_len += 64; maxs += 256;}
    (*lcont)++;
/*  hopefully we don't need this because of c_len -= 64. */
/*
    if (c_len == 1) {
#if !defined (USE_MMAP)
      c_pad = fgetc(m_fd->libf);
      *sptr=c_pad;
#else
      c_pad = *m_fd->mmap_addr++;
      sptr = m_fd->mmap_addr;
#endif
      c_pad &= 0x3;
      seq_len += c_pad;
      seqcnt++;
      lib_cnt++;
      *lcont = 0;
    }
*/
  }

  /* point to the last packed byte and to the end of the array
     seqcnt is the exact number of bytes read
     tptr points to the destination, use multiple of 4 to simplify math
     sptr points to the source, note that the last byte will be read 4 cycles
     before it is written
     */
  
  tptr = seq + 4*seqcnt;
  s_chunk = seqcnt/8;
  while (s_chunk-- > 0) {
    stmp = *--sptr;
    *--tptr = (stmp&3) +1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    stmp = *--sptr;
    *--tptr = (stmp&3) +1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    stmp = *--sptr;
    *--tptr = (stmp&3) +1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    stmp = *--sptr;
    *--tptr = (stmp&3) +1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    stmp = *--sptr;
    *--tptr = (stmp&3) +1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    stmp = *--sptr;
    *--tptr = (stmp&3) +1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    stmp = *--sptr;
    *--tptr = (stmp&3) +1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    stmp = *--sptr;
    *--tptr = (stmp&3) +1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
  }
  while (tptr>seq) {
    stmp = *--sptr;
    *--tptr = (stmp&3) +1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
    *--tptr = ((stmp >>= 2)&3)+1;
  }
  /*
    for (sptr=seq; sptr < seq+seq_len; sptr++) {
    printf("%c",nt[*sptr]);
    if ((int)(sptr-seq) % 60 == 59) printf("\n");
    }
    printf("\n");
    */

  m_fd->lpos = lib_cnt;
  if (seqcnt*4 >= seq_len) {	/* there was enough room */
    seq[seq_len]= EOSEQ;
    /* printf("%d\n",seq_len); */
    return seq_len;
  }
  else {				/* not enough room */
    seq[seqcnt*4]=EOSEQ;
    seq_len -= 4*seqcnt;
    return (4*seqcnt);
  }
  
error:	fprintf(stderr," error reading %s at %lld\n",libstr,*libpos);
  fflush(stderr);
  return (-1);
}

                /*   0     1     2     3    4     5     6    7
		     8     9    10    11   12    13    14   15
		     16    17 */
static char
*db_type_arr[] = {"lcl","gib","gim","gii","gb","emb","pir","sp",
		  "pat","ref","gnl","gi","dbj","prf","pdb","tpg",
		  "tpe","tpd"};

/* **************************************************************** */
/* read a description, position for sequence */
/* **************************************************************** */
void
ncbl2_ranlib(char *str,
	     int cnt,
	     fseek_t libpos,
	     char *libstr,
	     struct lmf_str *m_fd)
{
  int llen, lib_cnt;
  char *bp;
  unsigned char *my_buff=NULL;
  char descr[2048];
  unsigned char *abp;
  int gi, taxid;
  int my_db;
  char db[5], acc[20], name[20];
  char title[1024];
  int have_my_buff=0;
  int have_descr = 0;

  lib_cnt = (int)libpos;
  llen = m_fd->d_pos_arr[lib_cnt+1]-m_fd->d_pos_arr[lib_cnt];

  fseek(m_fd->hfile,m_fd->d_pos_arr[libpos],0);

  if (m_fd->bl_format_ver == FORMATDBV3) {
    if (llen >= cnt) llen = cnt-1;
    fread(str,(size_t)1,(size_t)(llen),m_fd->hfile);
  }
  else {
    if (llen >= m_fd->tmp_buf_max) {
      if ((my_buff=(unsigned char *)calloc(llen,sizeof(char)))==NULL) {
	fprintf(stderr," cannot allocate ASN.1 buffer: %d\n",llen);
	my_buff = (unsigned char *)m_fd->tmp_buf;
	llen = m_fd->tmp_buf_max;
      }
      else have_my_buff = 1;
    }
    else { 
      my_buff = (unsigned char *)m_fd->tmp_buf;
    }
    abp = my_buff;
    fread(my_buff,(size_t)1,llen,m_fd->hfile);

    do {
      abp = parse_fastadl_asn(abp, my_buff+llen,
			      &gi, &my_db, acc, name,
			      title, sizeof(title), &taxid);

      if (gi > 0) {
	sprintf(descr,"gi|%d|%s|%s|%s ",gi,db_type_arr[my_db],acc,name);
      }
      else {
	if (acc[0] != '\0') sprintf(descr,"%s ",acc);
	else descr[0] = '\0';
	if (name[0] != '\0' && strcmp(name,"BL_ORD_ID")!=0) sprintf(descr+strlen(descr),"%s ", name);
      }
      if (my_db == 0 || m_fd->pref_db < 0) {
	if (!have_descr) {
	  strncpy(str,descr,cnt-1);
	  have_descr = 1;
	}
	else {
	  strncat(str,"\001",cnt-strlen(str)-1);
	  strncat(str,descr,cnt-strlen(str)-1);
	}
	strncat(str,title,cnt-strlen(str)-1);
	if (strlen(str) >= cnt-1) break;
      }
      else if (m_fd->pref_db == my_db) {
	have_descr = 1;
	strncpy(str,descr,cnt-1);
	strncat(str,title,cnt-strlen(str)-1);
	break;
      }
    } while (abp);

    if (!have_descr) {
      strncpy(str,descr,cnt-1);
      strncat(str,descr,cnt-strlen(str)-1);
    }

    if (have_my_buff) free(my_buff);
  }

  str[cnt-1]='\0';

  bp = str;
  while((bp=strchr(bp,'\001'))!=NULL) {*bp++=' ';}

  if (!m_fd->mm_flg) fseek(m_fd->libf,m_fd->s_pos_arr[libpos],0);

  m_fd->lpos = lib_cnt;
  m_fd->bl_lib_pos = m_fd->s_pos_arr[lib_cnt];
}

unsigned int bl2_uint4_cvt(unsigned int val)
{
  unsigned int res;
#ifdef IS_BIG_ENDIAN
  return val;
#else /* it better be LITTLE_ENDIAN */
  res = ((val&255)*256)+ ((val>>8)&255);
  res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
  return res;
#endif
}  

unsigned int bl2_long4_cvt(long val)
{
  int val4;
  unsigned int res;
#ifdef IS_BIG_ENDIAN
  val4 = val;
  return val4;
#else /* it better be LITTLE_ENDIAN */
  res = ((val&255)*256)+ ((val>>8)&255);
  res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
  return res;
#endif
}  

uint64_t bl2_long8_cvt(uint64_t val)
{
  uint64_t res;
#ifdef IS_BIG_ENDIAN
  return val;
#else /* it better be LITTLE_ENDIAN */
  res = ((val&255)*256)+ ((val>>8)&255);
  res = (res<<16) + (((val>>16)&255)*256) + ((val>>24)&255);
#ifdef BIG_LIB64
  res = (res<<16) + (((val>>32)&255)*256) + ((val>>40)&255);
  res = (res<<16) + (((val>>48)&255)*256) + ((val>>56)&255);
#else
  fprintf(stderr,"Cannot use bl2_long8_cvt without 64-bit longs\n");
  exit(1);
#endif
  return res;
#endif
}  

void src_int4_read(FILE *fd,  int *val)
{
#ifdef IS_BIG_ENDIAN
  fread((char *)val,(size_t)4,(size_t)1,fd);
#else
  unsigned char b[4];

  fread((char *)&b[0],(size_t)1,(size_t)4,fd);
  *val = 0;
  *val = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)
	  +(int)b[3];
#endif
}

void src_long4_read(FILE *fd,  long *valp)
{
  int val4;
#ifdef IS_BIG_ENDIAN
  fread(&val4,(size_t)4,(size_t)1,fd);
  *valp = val4;
#else
  unsigned char b[4];

  fread((char *)&b[0],(size_t)1,(size_t)4,fd);
  val4 = 0;
  val4 = (int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)
	  +(int)b[3];
  *valp = val4;
#endif
}

void src_uint4_read(FILE *fd,  unsigned int *valp)
{
#ifdef IS_BIG_ENDIAN
  fread(valp,(size_t)4,(size_t)1,fd);
#else
  unsigned char b[4];

  fread((char *)&b[0],(size_t)1,(size_t)4,fd);
  *valp = 0;
  *valp = (unsigned int)((int)((int)(b[0]<<8)+(int)b[1]<<8)+(int)b[2]<<8)
	  +(int)b[3];
#endif
}

void src_long8_read(FILE *fd,  int64_t *val)
{
#ifdef IS_BIG_ENDIAN
  fread((void *)val,(size_t)8,(size_t)1,fd);
#else
  int64_t val_h;
  uint64_t val_l;
  unsigned char b[8];

  fread((char *)&b[0],(size_t)1,(size_t)8,fd);

  /* modified to work with both 32-bit and 64-bit native ints */
  val_h = (long)((long)((long)(b[0]<<8)+(long)b[1]<<8)+(long)b[2]<<8)+(long)b[3];
  val_l = (long)((long)((long)(b[4]<<8)+(long)b[5]<<8)+(long)b[6]<<8)+(long)b[7];
  *val = val_h * 4294967296LL + val_l;

  /*
  *val = 0;
  *val = (long)((((((long)((long)(b[0]<<8)+(long)b[1]<<8)+(long)b[2]<<8)
		  +(long)b[3]<<8)+(long)b[4]<<8)+(long)b[5]<<8)
		+(long)b[6]<<8)+(long)b[7];
  */
#endif
}

void ncbi_long8_read(FILE *fd,  int64_t *val)
{
  unsigned char b[8];

  fread((char *)&b[0],(size_t)1,(size_t)8,fd);
  *val = 0;
  *val = (long)((((((long)((long)(b[7]<<8)+(long)b[6]<<8)+(long)b[5]<<8)
		  +(long)b[4]<<8)+(long)b[3]<<8)+(long)b[2]<<8)
		+(long)b[1]<<8)+(long)b[0];
}

void src_char_read(FILE *fd, char *val)
{
  fread(val,(size_t)1,(size_t)1,fd);
}

void src_fstr_read(FILE *fd, char *val,  int slen)
{
  fread(val,(size_t)slen,(size_t)1,fd);
}

void
newname(char *nname, char *oname, char *suff, int maxn)
{
  strncpy(nname,oname,maxn-1);
  strncat(nname,".",1);
  strncat(nname,suff,maxn-strlen(nname));
}

#define ASN_SEQ 0x30
#define ASN_IS_BOOL 1
#define ASN_IS_INT 2
#define ASN_IS_STR 26

unsigned char *
get_asn_int(unsigned char *abp, int *val) {

  int v_len, v;

  v = 0;
  if (*abp++ != ASN_IS_INT) { /* check for int */
    fprintf(stderr," int missing\n");
  }
  else {
    v_len = *abp++;
    while (v_len-- > 0) {
      v *= 256;
      v += *abp++;
    }
    abp += 2;	/* skip over null's */
  }
  *val = v;
  return abp;
}

unsigned char *
get_asn_text(unsigned char *abp, char *text, int t_len) {
  int tch, at_len;

  text[0] = '\0';
  if (*abp++ != ASN_IS_STR) { /* check for str */
    fprintf(stderr," str missing\n");
  }
  else {
    if ((tch = *abp++) > 128) {	/* string length is in next bytes */
      tch &= 0x7f;	/* get number of bytes for len */
      at_len = 0;
      while (tch-- > 0) { at_len = (at_len << 8) + *abp++;}
    }
    else {
      at_len = tch;
    }

    if ( at_len < t_len-1) {
      memcpy(text, abp, at_len);
      text[at_len] = '\0';
    }
    else {
      memcpy(text, abp, t_len-1);
      text[t_len-1] = '\0';
    }
    abp += at_len + 2;
  }
  return abp;
}

/* something to try to skip over stuff we don't want */
unsigned char *
get_asn_junk(unsigned char *abp) {

  int seq_cnt = 0;
  int tmp;
  char string[256];

  while (*abp) {
    if ( *abp  == ASN_SEQ) { abp += 2; seq_cnt++;}
    else if ( *abp == ASN_IS_BOOL ) {abp = get_asn_int(abp, &tmp);}
    else if ( *abp == ASN_IS_INT ) {abp = get_asn_int(abp, &tmp);}
    else if ( *abp == ASN_IS_STR ) {abp = get_asn_text(abp, string, sizeof(string)-1);}
  }

  while (seq_cnt-- > 0) abp += 2;
  return abp;
}

unsigned char *
get_asn_textseq_id(unsigned char *abp, 
		   char *name, char *acc)
{
  char release[20], ver_str[10];
  int version;
  int seqcnt = 0;

  ver_str[0]='\0';

  if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}

  while (*abp) {
    switch (*abp) {
    case 0xa0:
      abp = get_asn_text(abp+2, name, 20);
      break;
    case 0xa1:
      abp = get_asn_text(abp+2, acc, 20);
      break;
    case 0xa2:
      abp = get_asn_text(abp+2, release, sizeof(release));
      break;
    case 0xa3:
      abp = get_asn_int(abp+2, &version);
      sprintf(ver_str,".%d",version);
      break;
    default: abp += 2;
    }
  }
  while (seqcnt-- > 0) abp += 4;
  strncat(acc,ver_str,20-strlen(acc));
  acc[19]='\0';
  return abp;	/* skip 2 NULL's */
}

unsigned char *
get_asn_local_id(unsigned char *abp, char *acc)
{
  int seqcnt = 0;

  if (*abp == ASN_SEQ) { abp += 2; seqcnt++;}

  abp = get_asn_text(abp+2, acc, 20);

  while (seqcnt-- > 0) abp += 4;
  acc[19]='\0';
  return abp;	/* skip 2 NULL's */
}

unsigned char *
get_asn_dbtag(unsigned char *abp, char *name, char *str, int *id_p) {

  if (*abp == ASN_SEQ) { abp += 2;}

  if (*abp == 0xa0) {  /* get db */
    abp = get_asn_text(abp+2, name, 20);
  }
  else {
    fprintf(stderr," missing dbtag:db %d %d\n",abp[0],abp[1]);
    abp += 2;
  }

  if (*abp == 0xa1) {  /* get tag */
    abp += 2;
    abp += 2; /* skip over id */
    if (*abp == 2) abp = get_asn_int(abp,id_p);
    else abp = get_asn_text(abp+2, str, 20);
  }
  else {
    fprintf(stderr," missing dbtag:tag %2x %2x\n",abp[0],abp[1]);
    abp += 2;
  }
  return abp+2;	/* skip 2 NULL's */
}

unsigned char *
get_asn_pdb_id(unsigned char *abp, char *acc, char *chain)
{
  int ichain, seq_cnt=0;

  if (*abp == ASN_SEQ) { abp += 2; seq_cnt++;}

  while (*abp) {
    switch (*abp) {
    case 0: abp += 2; break;
    case 0xa0:	/* mol-id */
      abp = get_asn_text(abp+2, acc, 20);
      break;
    case 0xa1:
      abp = get_asn_int(abp+2, &ichain);
      chain[0] = ichain;
      chain[1] = '\0';
      break;
    case 0xa2:	/* ignore date - scan until NULL's */
      while (*abp++) {}
      abp += 2;		/* skip the NULL's */
      break;
    default: abp+=2;
    }
  }
  while (seq_cnt-- > 0) {abp += 4;}
  return abp;
}

#define ASN_TYPE_MASK 31

unsigned char
*get_asn_seqid(unsigned char *abp,
	       int *gi_p, int *db, char *acc, char *name) {

  int db_type, itmp, seq_cnt=0;

  *gi_p = 0;

  if (*abp != ASN_SEQ) {
    fprintf(stderr, "seqid - missing SEQ 1: %2x %2x\n",abp[0], abp[1]);
    return abp;
  }
  else { abp += 2; seq_cnt++;}

  db_type = (*abp & ASN_TYPE_MASK);

  if (db_type == 11) { /* gi */
    abp = get_asn_int(abp+2,gi_p);
  }
  
  while (*abp == ASN_SEQ) {abp += 2; seq_cnt++;}

  db_type = (*abp & ASN_TYPE_MASK);
  if (db_type > 17) {db_type = 0;}
  *db = db_type;

  switch(db_type) {
  case 0: 
    abp = get_asn_local_id(abp+2, acc);
    break;
  case 1:
  case 2:
    abp = get_asn_int(abp+2,&itmp);
    abp += 2;
    break;
  case 11:
    abp = get_asn_int(abp+2,&itmp);
    break;
  case 4:
  case 5:
  case 6:
  case 7:
  case 9:
  case 12:
  case 13:
  case 15:
  case 16:
  case 17:
    abp = get_asn_textseq_id(abp+2,name,acc);
    break;
  case 10:
    abp = get_asn_dbtag(abp+2,name,acc,&itmp);
  case 14:
    abp = get_asn_pdb_id(abp+2,acc,name);
    break;
  default: abp += 2;
  }
  
  while (seq_cnt-- > 0) { abp += 4;}
  return abp; /* skip over 2 NULL's */
}

#define ASN_FADL_TITLE 0xa0
#define ASN_FADL_SEQID 0xa1
#define ASN_FADL_TAXID 0xa2
#define ASN_FADL_MEMBERS 0xa3
#define ASN_FADL_LINKS 0xa4
#define ASN_FADL_OTHER 0xa5

unsigned char *
parse_fastadl_asn(unsigned char *asn_buff, unsigned char *asn_max,
		  int *gi_p, int *db, char *acc,
		  char *name, char *title, int t_len, int *taxid_p) {
  unsigned char *abp;
  char tmp_db[4], tmp_acc[32], tmp_name[32];
  int this_db;
  int seq_cnt = 0;
  int tmp_gi;

  acc[0] = name[0] = db[0] = title[0] = '\0';

  abp = asn_buff;
  while ( abp < asn_max && *abp) {
    if (*abp == ASN_SEQ) { abp += 2; seq_cnt++; }
    else if (*abp == ASN_FADL_TITLE) {
      abp = get_asn_text(abp+2, title, t_len);
    }
    else if (*abp == ASN_FADL_SEQID ) {
      abp = get_asn_seqid(abp+2, gi_p, db, acc, name);
      if (*db > 17) *db = 0;
    }
    else if (*abp == ASN_FADL_TAXID ) {
      abp = get_asn_int(abp+2, taxid_p);
    }
    else if (*abp == ASN_FADL_MEMBERS) {
      abp = get_asn_junk(abp+2);
    }
    else if (*abp == ASN_FADL_LINKS ) {
      abp = get_asn_junk(abp+2);
    }
    else if (*abp == ASN_FADL_OTHER ) {
      abp = get_asn_junk(abp+2);
    }
    else {
      /*       fprintf(stderr, " Error - missing ASN.1 %2x:%2x:%2x:%2x\n", 
	       abp[-2],abp[-1],abp[0],abp[1]); */
      abp += 2;
    }
  }
  while (abp < asn_max && *abp == '\0'  ) abp++;
  if (abp >= asn_max) return NULL;
  else return abp;
}


void
parse_pal(char *dname, char *msk_name,
	  int *oid_seqs, int *max_oid,
	  FILE *fd) {

  char line[MAX_STR];

  while (fgets(line,sizeof(line),fd)) {
    if (line[0] == '#') continue;

    if (strncmp(line, "DBLIST", 6)==0) {
      sscanf(line+7,"%s",dname);
    }
    else if (strncmp(line, "OIDLIST", 7)==0) {
      sscanf(line+8,"%s",msk_name);
    }
    else if (strncmp(line, "NSEQ", 4)==0) {
      sscanf(line+5,"%d",oid_seqs);
    }
    else if (strncmp(line, "MAXOID", 6)==0) {
      sscanf(line+7,"%d",max_oid);
    }
  }
}
