/*   cleanasn.c
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*            National Center for Biotechnology Information (NCBI)
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government do not place any restriction on its use or reproduction.
*  We would, however, appreciate having the NCBI and the author cited in
*  any work or product based on this material
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
* ===========================================================================
*
* File Name:  cleanasn.c
*
* Author:  Jonathan Kans
*
* Version Creation Date:   10/19/99
*
* $Revision: 6.5 $
*
* File Description: 
*
* Modifications:  
* --------------------------------------------------------------------------
* Date     Name        Description of modification
* -------  ----------  -----------------------------------------------------
*
*
* ==========================================================================
*/

#include <string.h>

#include <ncbi.h>
#include <objall.h>
#include <objsset.h>
#include <objfdef.h>
#include <objsub.h>
#include <sequtil.h>
#include <sqnutils.h>
#include <explore.h>
#include <toasn3.h>
#include <pmfapi.h>
#include <tax3api.h>
#ifdef INTERNAL_NCBI_CLEANASN
#include <accpubseq.h>
#endif

/* version number of this program */
#define CLEANASN_APP_VER "1.2"

/* version string that Main formats into the program name handed to GetArgs */
CharPtr CLEANASN_APPLICATION = CLEANASN_APP_VER;

/*
 * Options handed to CleanupOneRecord through its userdata pointer.
 * Main fills the structure from the command-line arguments; exactly one of
 * results / outfile is set by Main, depending on whether a directory or a
 * single file is being processed.
 */
typedef struct cleanflags {
  CharPtr  results;   /* directory that receives the output files */
  CharPtr  outfile;   /* explicit output file; takes precedence over results */
  CharPtr  clean;     /* cleanup option letters: 'b' basic, 's' serious */
  CharPtr  link;      /* link option letters: 'o', 'p', 'r', 'c' */
  CharPtr  feat;      /* feature option letters: 'u', 'd', 'r' */
  Boolean  taxon;     /* TRUE to call Taxon3ReplaceOrgInSeqEntry */
} CleanFlagData, PNTR CleanFlagPtr;

/*
 * RemoveFeatUser
 *
 * Feature visitor that frees the user object stored in sfp->ext, if any,
 * and stores the value returned by UserObjectFree back into the field.
 * The userdata argument is not used.
 */
static void RemoveFeatUser (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  /* nothing to do without a feature or without an attached extension */
  if (sfp == NULL || sfp->ext == NULL) return;

  sfp->ext = UserObjectFree (sfp->ext);
}

/*
 * RemoveFeatDbxref
 *
 * Feature visitor that empties the feature's dbxref list: every node's
 * Dbtag is released with DbtagFree, the node itself with MemFree, and the
 * list head is reset to NULL.  The userdata argument is not used.
 */
static void RemoveFeatDbxref (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  ValNodePtr  doomed, node;

  if (sfp == NULL) return;

  node = sfp->dbxref;
  while (node != NULL) {
    /* step forward first, the current node is about to be freed */
    doomed = node;
    node = node->next;
    DbtagFree ((DbtagPtr) doomed->data.ptrvalue);
    MemFree (doomed);
  }

  sfp->dbxref = NULL;
}

/*
 * Running tally kept by CADummySMFEProc while it is called once per feature.
 * Despite its name, max holds the SMALLEST location length seen so far
 * (RemoveUnnecGeneXref seeds it with INT4_MAX).
 */
typedef struct dummysmfedata {
  Int4  max;          /* smallest SeqLocLen seen so far */
  Int4  num_at_max;   /* number of features whose length equals max */
} DummySmfeData, PNTR DummySmfePtr;

/*
 * CADummySMFEProc
 *
 * Per-feature callback that maintains a DummySmfeData tally taken from
 * context->userdata.  It tracks the smallest location length encountered
 * (stored in the field named max) and how many features have exactly that
 * length.  Always returns TRUE.
 */
static Boolean LIBCALLBACK CADummySMFEProc (
  SeqFeatPtr sfp,
  SeqMgrFeatContextPtr context
)

{
  Int4          featlen;
  DummySmfePtr  tally;

  if (sfp != NULL && context != NULL) {
    tally = (DummySmfePtr) context->userdata;
    if (tally != NULL) {
      featlen = SeqLocLen (sfp->location);
      if (featlen == tally->max) {
        /* another feature ties with the current smallest length */
        (tally->num_at_max)++;
      } else if (featlen < tally->max) {
        /* new smallest length, restart the tie count */
        tally->max = featlen;
        tally->num_at_max = 1;
      }
    }
  }

  return TRUE;
}

/*
 * RemoveUnnecGeneXref
 *
 * Feature visitor that drops gene xrefs from a non-gene feature when they
 * merely repeat the gene that is already found by location overlap.
 *
 * The xref is kept (function returns early) when
 *   - the feature has no gene xref, or the xref is a suppressor,
 *   - no overlapping gene feature is found,
 *   - the xref and the overlapping gene disagree on the first identifier
 *     that BOTH of them carry, tried in the order locus, locus_tag,
 *     first synonym (case-insensitive comparison).
 * If no identifier is carried by both, no comparison is made and processing
 * continues with the ambiguity check below.
 *
 * Ambiguity check: all gene features reported for the feature's location
 * with LOCATION_SUBSET are measured; if two or more share the smallest
 * location length the xref is kept, otherwise every gene xref is unlinked
 * from sfp->xref and freed.
 *
 * CleanupOneRecord calls SeqMgrIndexFeatures before visiting features with
 * this callback.  The userdata argument is not used.
 */
static void RemoveUnnecGeneXref (
  SeqFeatPtr sfp,
  Pointer userdata
)

{
  SeqFeatXrefPtr       curr, next;
  DummySmfeData        dsd;
  SeqMgrFeatContext    fcontext;
  SeqFeatXrefPtr PNTR  last;
  GeneRefPtr           grp, grpx;
  SeqFeatPtr           sfpx;
  CharPtr              syn1, syn2;

  if (sfp == NULL || sfp->data.choice == SEQFEAT_GENE) return;
  grp = SeqMgrGetGeneXref (sfp);
  if (grp == NULL || SeqMgrGeneIsSuppressed (grp)) return;
  sfpx = SeqMgrGetOverlappingGene (sfp->location, &fcontext);
  if (sfpx == NULL || sfpx->data.choice != SEQFEAT_GENE) return;
  grpx = (GeneRefPtr) sfpx->data.value.ptrvalue;
  if (grpx == NULL) return;

  if (StringDoesHaveText (grp->locus) &&
      StringDoesHaveText (grpx->locus)) {
    if (StringICmp (grp->locus, grpx->locus) != 0) return;
  } else if (StringDoesHaveText (grp->locus_tag) &&
             StringDoesHaveText (grpx->locus_tag)) {
    /* both sides must carry a locus_tag before the tags are compared */
    if (StringICmp (grp->locus_tag, grpx->locus_tag) != 0) return;
  } else if (grp->syn != NULL && grpx->syn != NULL) {
    /* only the first synonym of each gene reference is considered */
    syn1 = (CharPtr) grp->syn->data.ptrvalue;
    syn2 = (CharPtr) grpx->syn->data.ptrvalue;
    if (StringDoesHaveText (syn1) && StringDoesHaveText (syn2)) {
      if (StringICmp (syn1, syn2) != 0) return;
    }
  }

  /* count how many overlapping genes tie for the smallest location length */
  MemSet ((Pointer) &dsd, 0, sizeof (DummySmfeData));
  dsd.max = INT4_MAX;
  dsd.num_at_max = 0;
  SeqMgrGetAllOverlappingFeatures (sfp->location, FEATDEF_GENE,
                                   NULL, 0, LOCATION_SUBSET,
                                   (Pointer) &dsd, CADummySMFEProc);

  if (dsd.num_at_max < 2) {
    /* unlink and free every gene xref, keeping all other xrefs in order */
    last = (SeqFeatXrefPtr PNTR) &(sfp->xref);
    curr = sfp->xref;
    while (curr != NULL) {
      next = curr->next;
      if (curr->data.choice == SEQFEAT_GENE) {
        *last = next;
        curr->next = NULL;
        SeqFeatXrefFree (curr);
      } else {
        last = &(curr->next);
      }
      curr = next;
    }
  }
}

/*
 * CleanupOneRecord
 *
 * Reads one record from filename, applies the operations selected in the
 * CleanFlagData passed through userdata, and writes the result as ASN.1.
 *
 * filename  file to read; ignored when it has no text
 * userdata  CleanFlagPtr with the selected options; ignored when NULL
 *
 * Output location:
 *   - cfp->outfile, when it has text;
 *   - otherwise cfp->results joined with the leaf name of filename
 *     (the whole filename when it contains no directory delimiter).
 * When neither is set nothing is processed or written.
 *
 * Problems (input cannot be opened or read, output path too long, output
 * cannot be opened) are reported with Message (MSG_POSTERR, ...) and the
 * record is skipped.  The entity loaded for the record is always released
 * with ObjMgrFreeByEntityID before returning from the read path.
 */
static void CleanupOneRecord (
  CharPtr filename,
  Pointer userdata
)

{
  AsnIoPtr      aip;
  CleanFlagPtr  cfp;
  Pointer       dataptr;
  Uint2         datatype = 0;
  Uint2         entityID = 0;
  FILE*         fp;
  CharPtr       leaf;
  Char          path [PATH_MAX];
  SeqEntryPtr   sep;

  if (StringHasNoText (filename)) return;
  cfp = (CleanFlagPtr) userdata;
  if (cfp == NULL) return;

  fp = FileOpen (filename, "r");
  if (fp == NULL) {
    Message (MSG_POSTERR, "Failed to open '%s'", filename);
    return;
  }

  dataptr = ReadAsnFastaOrFlatFile (fp, &datatype, &entityID, FALSE,
                                    FALSE, TRUE, FALSE);

  FileClose (fp);

  /* work out where the cleaned record is to be written */

  path [0] = '\0';
  if (StringDoesHaveText (cfp->outfile)) {
    StringNCpy_0 (path, cfp->outfile, sizeof (path));
  } else if (StringDoesHaveText (cfp->results)) {
    leaf = StringRChr (filename, DIRDELIMCHR);
    leaf = (leaf != NULL) ? leaf + 1 : filename;
    /* FileBuildPath appends without a size argument, so check the fit
       first: directory + optional delimiter + leaf + terminating NUL */
    if (strlen (cfp->results) + strlen (leaf) + 2 > sizeof (path)) {
      Message (MSG_POSTERR, "Output path for '%s' is too long", filename);
    } else {
      StringNCpy_0 (path, cfp->results, sizeof (path));
      FileBuildPath (path, NULL, leaf);
    }
  }

  sep = GetTopSeqEntryForEntityID (entityID);
  if (dataptr == NULL || sep == NULL) {
    Message (MSG_POSTERR, "Failed to read a record from '%s'", filename);
  }

  if (sep != NULL && StringDoesHaveText (path)) {

    /* cleanup options */

    if (StringChr (cfp->clean, 'b') != NULL) {
      BasicSeqEntryCleanup (sep);
    }
    if (StringChr (cfp->clean, 's') != NULL) {
      SeriousSeqEntryCleanup (sep, NULL, NULL);
    }

    if (cfp->taxon) {
      Taxon3ReplaceOrgInSeqEntry (sep, FALSE);
    }

    /* link options, features are (re)indexed before each operation
       that is preceded by SeqMgrIndexFeatures */

    if (StringChr (cfp->link, 'o') != NULL) {
      SeqMgrIndexFeatures (entityID, 0);
      LinkCDSmRNAbyOverlap (sep);
    }
    if (StringChr (cfp->link, 'p') != NULL) {
      SeqMgrIndexFeatures (entityID, 0);
      LinkCDSmRNAbyProduct (sep);
    }
    if (StringChr (cfp->link, 'r') != NULL) {
      SeqMgrIndexFeatures (entityID, 0);
      ReassignFeatureIDs (sep);
    }
    if (StringChr (cfp->link, 'c') != NULL) {
      ClearFeatureIDs (sep);
    }

    /* feature options */

    if (StringChr (cfp->feat, 'u') != NULL) {
      VisitFeaturesInSep (sep, NULL, RemoveFeatUser);
    }
    if (StringChr (cfp->feat, 'd') != NULL) {
      VisitFeaturesInSep (sep, NULL, RemoveFeatDbxref);
    }
    if (StringChr (cfp->feat, 'r') != NULL) {
      SeqMgrIndexFeatures (entityID, 0);
      VisitFeaturesInSep (sep, NULL, RemoveUnnecGeneXref);
    }

    /* a Seq-submit is written back as a Seq-submit, anything else
       as the top Seq-entry */

    aip = AsnIoOpen (path, "w");
    if (aip == NULL) {
      Message (MSG_POSTERR, "Failed to open '%s' for writing", path);
    } else {
      if (datatype == OBJ_SEQSUB) {
        SeqSubmitAsnWrite ((SeqSubmitPtr) dataptr, aip, NULL);
      } else {
        SeqEntryAsnWrite (sep, aip, NULL);
      }
      AsnIoFlush (aip);
      AsnIoClose (aip);
    }
  }

  ObjMgrFreeByEntityID (entityID);
}

/* Args structure contains command-line arguments */

/*
 * Indices into the myargs array.  Main reads each option as
 * myargs [index], so the values must stay in step with the order of the
 * entries in the myargs initializer.
 */
#define p_argInputPath     0
#define r_argOutputPath    1
#define i_argInputFile     2
#define o_argOutputFile    3
#define f_argFilter        4
#define x_argSuffix        5
#define R_argRemote        6
#define c_argClean         7
#define l_argLink          8
#define f_argFeat          9
#define t_argTaxonLookup  10

/*
 * Command-line argument table passed to GetArgs by Main.  Entry order must
 * match the *_arg* index macros used to read the values back.
 *
 * Every entry needs its own tag letter.  The Feature entry uses 'F'
 * because 'f' already belongs to Substring Filter; two entries sharing a
 * tag cannot both be selected from the command line.
 */
Args myargs [] = {
  {"Path to Files", NULL, NULL, NULL,
    TRUE, 'p', ARG_STRING, 0.0, 0, NULL},
  {"Path for Results", NULL, NULL, NULL,
    TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
  {"Single Input File", "stdin", NULL, NULL,
    TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
  {"Single Output File", "stdout", NULL, NULL,
    TRUE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
  {"Substring Filter", NULL, NULL, NULL,
    TRUE, 'f', ARG_STRING, 0.0, 0, NULL},
  {"File Selection Suffix", ".ent", NULL, NULL,
    TRUE, 'x', ARG_STRING, 0.0, 0, NULL},
  {"Remote Fetching from ID", "F", NULL, NULL,
    TRUE, 'R', ARG_BOOLEAN, 0.0, 0, NULL},
  {"Cleanup\n"
   "      b BasicSeqEntryCleanup\n"
   "      s SeriousSeqEntryCleanup", NULL, NULL, NULL,
    TRUE, 'c', ARG_STRING, 0.0, 0, NULL},
  {"Link\n"
   "      o LinkCDSmRNAbyOverlap\n"
   "      p LinkCDSmRNAbyProduct\n"
   "      r ReassignFeatureIDs\n"
   "      c ClearFeatureIDs", NULL, NULL, NULL,
    TRUE, 'l', ARG_STRING, 0.0, 0, NULL},
  {"Feature\n"
   "      u Remove User Object\n"
   "      d Remove db_xref\n"
   "      r Remove Redundant Gene xref", NULL, NULL, NULL,
    TRUE, 'F', ARG_STRING, 0.0, 0, NULL},
  {"Taxonomy Lookup", "F", NULL, NULL,
    TRUE, 't', ARG_BOOLEAN, 0.0, 0, NULL},
};

/*
 * Main
 *
 * Program entry point.  Loads the ASN.1 parse tables, reads the command
 * line into myargs, and then either
 *   - explores the input directory, calling CleanupOneRecord on every file
 *     selected by the substring filter and suffix, writing into the results
 *     directory (which defaults to the input directory), or
 *   - processes the single input file into the single output file.
 * Remote fetching is enabled around the processing when requested.
 *
 * Returns 1 when a parse table fails to load or (internal build only) when
 * remote fetching cannot be enabled, 0 otherwise, including the case where
 * GetArgs returns FALSE.
 */
Int2 Main (void)

{
  Char           app [64];
  CleanFlagData  cfd;
  CharPtr        directory, filter, infile, outfile, results, suffix;
  Boolean        remote;

  /* standard setup */

  ErrSetFatalLevel (SEV_MAX);
  ErrClearOptFlags (EO_SHOW_USERSTR);
  UseLocalAsnloadDataAndErrMsg ();
  ErrPathReset ();

  /* finish resolving internal connections in ASN.1 parse tables */

  if (! AllObjLoad ()) {
    Message (MSG_FATAL, "AllObjLoad failed");
    return 1;
  }
  if (! SubmitAsnLoad ()) {
    Message (MSG_FATAL, "SubmitAsnLoad failed");
    return 1;
  }
  if (! FeatDefSetLoad ()) {
    Message (MSG_FATAL, "FeatDefSetLoad failed");
    return 1;
  }
  if (! SeqCodeSetLoad ()) {
    Message (MSG_FATAL, "SeqCodeSetLoad failed");
    return 1;
  }
  if (! GeneticCodeTableLoad ()) {
    Message (MSG_FATAL, "GeneticCodeTableLoad failed");
    return 1;
  }

  /* process command line arguments */

  sprintf (app, "cleanasn %s", CLEANASN_APPLICATION);
  if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
    return 0;
  }

  MemSet ((Pointer) &cfd, 0, sizeof (CleanFlagData));

  directory = (CharPtr) myargs [p_argInputPath].strvalue;
  results = (CharPtr) myargs [r_argOutputPath].strvalue;
  if (StringHasNoText (results)) {
    /* no separate results directory, files are rewritten in place */
    results = directory;
  }
  infile = (CharPtr) myargs [i_argInputFile].strvalue;
  outfile = (CharPtr) myargs [o_argOutputFile].strvalue;
  filter = (CharPtr) myargs [f_argFilter].strvalue;
  suffix = (CharPtr) myargs [x_argSuffix].strvalue;

  remote = (Boolean) myargs [R_argRemote].intvalue;

  cfd.clean = myargs [c_argClean].strvalue;
  cfd.link = myargs [l_argLink].strvalue;
  cfd.feat = myargs [f_argFeat].strvalue;
  cfd.taxon = (Boolean) myargs [t_argTaxonLookup].intvalue;

  if (remote) {
#ifdef INTERNAL_NCBI_CLEANASN
    if (! PUBSEQBioseqFetchEnable ("cleanasn", FALSE)) {
      Message (MSG_POSTERR, "PUBSEQBioseqFetchEnable failed");
      return 1;
    }
#else
    PubSeqFetchEnable ();
#endif
  }

  /* a directory takes precedence over the single input/output file pair */

  if (StringDoesHaveText (directory)) {

    cfd.results = results;

    /* honor the substring filter given on the command line */
    DirExplore (directory, filter, suffix, FALSE, CleanupOneRecord, (Pointer) &cfd);

  } else if (StringDoesHaveText (infile) && StringDoesHaveText (outfile)) {

    cfd.outfile = outfile;

    CleanupOneRecord (infile, (Pointer) &cfd);
  }

  if (remote) {
#ifdef INTERNAL_NCBI_CLEANASN
    PUBSEQBioseqFetchDisable ();
#else
    PubSeqFetchDisable ();
#endif
  }

  return 0;
}

