# This file tells datatool how to generate
# the autogenerated_cleanup files.
# Its syntax is described in a big comment at the
# top of traversal_spec_file_parser.cpp

# example datatool usage to get the files out of this:
# ( foo.asn is the file(s) containing the ASN spec )
# ( spec.txt is THIS file. )
# -m foo.asn -oA -tvs spec.txt

# Last known command-line used for this:
# (You would have to adjust the file names to fit your 
# directory, of course )
# datatool -pch ncbi_pch.hpp -m ../../objects/seqset/seqset.asn  -m ../../objects/general/general.asn -m ../../objects/seq/seq.asn  -m ../../objects/seqloc/seqloc.asn -m ../../objects/seqfeat/seqfeat.asn -m ../../objects/seqblock/seqblock.asn  -m ../../objects/seqalign/seqalign.asn -m ../../objects/pub/pub.asn  -m ../../objects/biblio/biblio.asn  -m ../../objects/seqres/seqres.asn -m ../../objects/seqtable/seqtable.asn -m ../../objects/medline/medline.asn -m ../../objects/submit/submit.asn -oA -tvs autogenerated_cleanup.txt

# When browsing in a text-editor, perl-mode does a reasonable
# job of highlighting this file.

# Notice the hard-coded paths in here.  It's likely that you'll want
# to change these to put the code into the right place in your own
# build directory.
# Header file the generated class is written to.
output_header_file "./autogenerated_cleanup.hpp"
# Source file the generated implementation is written to.
output_source_file "./autogenerated_cleanup.cpp"
# Namespace-qualified name of the generated class.
output_class_name ncbi::objects::CAutogeneratedCleanup

# The auto-generated class needs a reference to the object containing all the
# cleanup functions.  Every "use m_NewCleanup.<function>" rule in this file
# goes through this member.
member { CNewCleanup_imp & m_NewCleanup }

# Traversal roots, one per line: "root <ASN.1 type> <name>".
# NOTE(review): <name> is presumably the name of the generated entry-point
# function for that type -- confirm against traversal_spec_file_parser.cpp.
root Seq-entry  BasicCleanupSeqEntry
root Seq-submit BasicCleanupSeqSubmit
root Seq-annot  BasicCleanupSeqAnnot
root Bioseq     BasicCleanupBioseq
root Bioseq-set BasicCleanupBioseqSet
root Seq-feat   BasicCleanupSeqFeat

# Include directives for the generated files.
# NOTE(review): header_include presumably lands in the generated .hpp and
# source_include in the generated .cpp -- confirm against the parser.
# You can use angle brackets or double-quotes
header_include "newcleanupp.hpp"
# Yes, you need the following line because the auto-generator doesn't
# necessarily know exactly how you want to include the .hpp file
source_include "autogenerated_cleanup.hpp"
source_include "cleanup_utils.hpp"
source_include "autogenerated_cleanup_extra.hpp"
source_include <objects/misc/sequence_macros.hpp>

# Some fields are deprecated and their accessor may be private, throw an
# exception, or be otherwise unusable.
# NOTE(review): presumably the generator keeps the fields listed here out of
# the traversal -- confirm against traversal_spec_file_parser.cpp.
# Every field currently listed belongs to Variation-ref.

deprecated {
    Variation-ref.population-data ,
    Variation-ref.validated ,
    Variation-ref.clinical-test ,
    Variation-ref.allele-origin ,
    Variation-ref.allele-state ,
    Variation-ref.allele-frequency ,
    Variation-ref.is-ancestral-allele ,
    Variation-ref.pub ,
    Variation-ref.location ,
    Variation-ref.ext-locs ,
    Variation-ref.ext
}

# Don't forget: order matters!
# (For example, the XML-decoding rule has to be listed before the other
# string cleaners, and EnteringEntry / LeavingEntry bracket everything else.)

# Call EnteringEntry as early as possible.
# (Its partner function is LeavingEntry, which is kept near the end of
# this file.)
use m_NewCleanup.EnteringEntry { Seq-entry }

# Should be before other types of cleaning, since other cleaners
# might remove the ';' at the end of XML codes (and a code that has lost
# its terminating ';' could presumably no longer be recognized and decoded).
use m_NewCleanup.x_DecodeXMLMarkChanged {
    Seq-feat.comment ,
    Seq-feat.title ,
    Gene-ref.locus ,
    Gene-ref.syn.E ,
    Prot-ref.name.E ,
    Prot-ref.desc ,
    RNA-ref.ext.name ,
    Seqdesc.title ,
    Seqdesc.comment ,
}

# Going by the function name, compresses runs of spaces in these string
# fields; see CNewCleanup_imp::x_CompressStringSpacesMarkChanged for the
# exact behavior.
use m_NewCleanup.x_CompressStringSpacesMarkChanged {
    Affil.str ,
    Prot-ref.name.E ,
    RNA-ref.ext.name ,
    Seq-feat.data.region ,
    Seqdesc.title ,
    Gene-ref.desc ,
}

# A "membermacro" rule names a macro rather than a CNewCleanup_imp member
# function.
# NOTE(review): CLEAN_STRING_MEMBER is presumably supplied by one of the
# source_include headers (e.g. sequence_macros.hpp) -- confirm.
membermacro CLEAN_STRING_MEMBER {
    Gene-ref.allele ,
    Gene-ref.desc ,
    Gene-ref.locus ,
    Gene-ref.locus-tag ,
    Gene-ref.maploc ,
    Imp-feat.descr ,
    Imp-feat.key ,
    Imp-feat.loc ,
    OrgName.attrib ,
    OrgName.div ,
    OrgName.lineage ,
    Seq-feat.comment ,
    Seq-feat.except-text ,
    Seq-feat.title ,
}

# String cleanup done through a CNewCleanup_imp member function rather than
# through a membermacro.
use m_NewCleanup.x_CleanupStringMarkChanged {
    Affil.str ,
    RNA-ref.ext.name ,
    Seq-feat.data.region ,
    Seqdesc.name ,
    Seqdesc.region ,
    Seqdesc.title ,
}

# Seqdesc.comment is not in the list above; it gets the "Junk" variant of
# the cleanup function instead.
use m_NewCleanup.x_CleanupStringJunkMarkChanged {
    Seqdesc.comment ,
}

# Fields handled by the combined clean-and-compress macro (going by the macro
# name; see the macro definition for the exact behavior).
membermacro CLEAN_AND_COMPRESS_STRING_MEMBER {
    Affil.std.affil ,
    Affil.std.city ,
    Affil.std.country ,
    Affil.std.div ,
    Affil.std.email ,
    Affil.std.fax ,
    Affil.std.phone ,
    Affil.std.postal-code ,
    Affil.std.street ,
    Affil.std.sub ,
    Imprint.issue ,
    Imprint.language ,
    Imprint.pages ,
    Imprint.part-sup ,
    Imprint.part-supi ,
    Imprint.section ,
    Imprint.volume ,
    Org-ref.common ,
    Org-ref.taxname ,
    OrgMod.attrib ,
    RNA-gen.class ,
    RNA-gen.product ,
    RNA-gen.quals.E.qual ,
    RNA-gen.quals.E.val ,
    Seq-feat.comment ,
    Gene-ref.locus ,
}

# OrgMod.subname is the only field that uses the _JUNK variant of the macro.
membermacro CLEAN_AND_COMPRESS_STRING_MEMBER_JUNK {
    OrgMod.subname ,
}

# Going by the function name, converts double-quote characters in these
# fields; see CNewCleanup_imp::x_ConvertDoubleQuotesMarkChanged for the
# exact behavior.
use m_NewCleanup.x_ConvertDoubleQuotesMarkChanged {
    Gene-ref.allele ,
    Gene-ref.desc ,
    Gene-ref.locus ,
    Gene-ref.locus-tag ,
    Gene-ref.maploc ,
    Gene-ref.syn.E ,
    Prot-ref.name.E ,
    Pubdesc.comment ,
    RNA-gen.class ,
    RNA-gen.product ,
    RNA-gen.quals.E.qual ,
    RNA-gen.quals.E.val ,
    RNA-ref.ext.name ,
    Seq-feat.comment ,
    Seq-feat.data.region ,
    Seqdesc.region ,
}

# Going by the function name, trims semicolons inside these strings; see
# CNewCleanup_imp::x_TrimInternalSemicolonsMarkChanged for the exact behavior.
use m_NewCleanup.x_TrimInternalSemicolonsMarkChanged {
    Org-ref.common ,
    Gene-ref.locus ,
    Seq-feat.data.region ,
    Prot-ref.desc ,
}
 
# These are POST in case substrings will be cleaned some way first, i.e. the
# list-level macro is meant to run only after any rules on the individual
# strings of the list (such as Gene-ref.syn.E or Prot-ref.name.E) have run.
membermacro CLEAN_STRING_LIST {
    POST Gene-ref.syn ,
    POST Org-ref.syn ,
    POST Prot-ref.name ,
}

# Miscellaneous single-purpose string fixers, each limited to the few fields
# listed.
use m_NewCleanup.x_StripSpacesMarkChanged {
    Cit-art.title.E.name
}

use m_NewCleanup.x_RemoveFlankingQuotes {
    RNA-ref.ext.name ,
    RNA-gen.product ,
    Prot-ref.name.E ,
}

use m_NewCleanup.x_FixUpEllipsis {
    Seq-feat.comment
}

use m_NewCleanup.x_RemoveSpacesBetweenTildesMarkChanged {
    Seqdesc.comment
}

# NOTE(review): this name starts with a capital "X_", unlike the "x_" prefix
# used by the other functions in this file -- confirm it matches the actual
# CNewCleanup_imp member name.
use m_NewCleanup.X_CommentTildeFixes {
    Seqdesc.comment
}

# Seq-id / Seq-loc stuff
use m_NewCleanup.SeqIdBC { Seq-id }

# SeqLocBC is listed for every Seq-loc.  x_BothStrandBC is listed for every
# Seq-loc except Seq-feat.location, and Seq-feat.location is the only
# Seq-loc listed for ConvertSeqLocWholeToInt.
use m_NewCleanup.SeqLocBC { Seq-loc }
use m_NewCleanup.x_BothStrandBC { 
    Seq-loc EXCEPT {
        Seq-feat.location ,
    }
}
use m_NewCleanup.ConvertSeqLocWholeToInt {
    Seq-feat.location 
}
use m_NewCleanup.SeqLocMixBC { Seq-loc.mix }

# Bioseq-set / Bioseq level cleanup
use m_NewCleanup.SeqsetBC { Bioseq-set }

use m_NewCleanup.ProtSeqBC { Bioseq }

# This really works on the Imp-feat inside the Seq-feat, if any.
# It's here so we convert Seq-feat's imp to another type (e.g. cdregion)
# if necessary.  And, we put it before SeqfeatBC, etc. so it happens
# as soon as possible.
use m_NewCleanup.ImpFeatBC { Seq-feat }

# NOTE(review): the POST rule is presumably invoked after the Seq-feat's
# contents have been traversed, whereas SeqfeatBC runs on the way in --
# confirm against traversal_spec_file_parser.cpp.
use m_NewCleanup.SeqfeatBC { Seq-feat }
use m_NewCleanup.x_PostSeqFeat { POST Seq-feat }

# Per-element rules for the strings in Prot-ref.name and Prot-ref.activity
use m_NewCleanup.ProtNameBC { Prot-ref.name.E }
use m_NewCleanup.ProtActivityBC { Prot-ref.activity.E }

# Seq-feat data stuff
# NOTE(review): an "X AND Y" entry names two nodes; presumably the function
# is handed both the matched field and the enclosing object (here the
# Seq-feat) -- confirm against traversal_spec_file_parser.cpp.
use m_NewCleanup.GeneFeatBC { Seq-feat.data.gene AND Seq-feat }
use m_NewCleanup.GenerefBC { POST Gene-ref }
use m_NewCleanup.ProtFeatfBC { Seq-feat.data.prot AND Seq-feat }
use m_NewCleanup.PostProtFeatfBC { POST Seq-feat.data.prot }
use m_NewCleanup.ProtrefBC { Prot-ref }
use m_NewCleanup.RnaFeatBC { POST Seq-feat.data.rna AND Seq-feat }
use m_NewCleanup.RnarefBC { POST RNA-ref }
use m_NewCleanup.PubdescBC { POST Pubdesc }
use m_NewCleanup.SiteFeatBC { Seq-feat.data.site AND Seq-feat }
use m_NewCleanup.CdregionFeatBC { Seq-feat.data.cdregion AND Seq-feat }

# GB-block / EMBL-block stuff
use m_NewCleanup.GBblockOriginBC { GB-block.origin }
use m_NewCleanup.GBblockBC { GB-block }
use m_NewCleanup.EMBLblockBC { EMBL-block }

# Biosource stuff
# Note that there are two separate POST BioSource rules, listed in the order
# x_PostBiosource, then x_ModernizePCRPrimers (order matters in this file).
use m_NewCleanup.x_SubSourceBC { SubSource }
use m_NewCleanup.x_OrgModBC { OrgMod }
use m_NewCleanup.BiosourceFeatBC { POST Seq-feat.data.biosrc AND Seq-feat }
use m_NewCleanup.BiosourceBC { BioSource }
use m_NewCleanup.x_PostBiosource { POST BioSource }
use m_NewCleanup.x_ModernizePCRPrimers { POST BioSource }
use m_NewCleanup.PCRReactionSetBC { BioSource.pcr-primers }
use m_NewCleanup.x_CleanupOrgModAndSubSourceOther { BioSource.org.orgname AND BioSource }

# Entry-level rule (the function is listed for Seq-entry as a whole)
use m_NewCleanup.x_CopyGBBlockDivToOrgnameDiv { Seq-entry }

use m_NewCleanup.x_ExpandCombinedQuals { Seq-feat.qual }

# Org-ref stuff
use m_NewCleanup.OrgrefModBC { Org-ref.mod.E }
use m_NewCleanup.OrgrefBC { Org-ref }
use m_NewCleanup.x_PostOrgRef { POST Org-ref }

# Covers both paths by which a Seq-feat's data can hold an org: directly
# (data.org) or inside its biosrc (data.biosrc.org).
use m_NewCleanup.x_GBQualToOrgRef {  
    Seq-feat.data.org AND Seq-feat ,
    Seq-feat.data.biosrc.org AND Seq-feat
}

use m_NewCleanup.x_DateStdBC { Date-std }

# DbtagBC is listed for these two specific fields rather than for a type.
use m_NewCleanup.DbtagBC { Seq-feat.dbxref.E, Org-ref.db.E }

use m_NewCleanup.DeltaExtBC { Seq-inst.ext.delta AND Seq-inst }

# MolInfo stuff
use m_NewCleanup.MolInfoBC { MolInfo }
use m_NewCleanup.x_FixUnsetMolFromBiomol{ Bioseq.descr.E.molinfo AND Bioseq }

use m_NewCleanup.x_AddPartialToProteinTitle { POST Bioseq }

# UserObjectBC is listed for each of the three fields below.
use m_NewCleanup.UserObjectBC { 
    Seq-feat.ext ,
    Seq-feat.data.user ,
    Seqdesc.user
}

use m_NewCleanup.x_CleanupECNumberList { Prot-ref.ec }

# While we're traversing the hierarchy, we record any
# muid to pmid mappings we can determine,
# and other information
use m_NewCleanup.x_NotePubdescOrAnnotPubs {
    POST Seqdesc.pub.pub ,
    POST Seq-feat.data.pub.pub
}

# We remember all the muid pubs that might have to be converted to
# pmid later (but we can't look at them until we finish the traversal)
use m_NewCleanup.x_RememberMuidThatMightBeConvertibleToPmid {
    POST Seq-feat.cit.pub.E.muid AND Seq-feat.cit.pub.E
}

# Remember all Seq-feat cit pubs (every element of Seq-feat.cit.pub)
use m_NewCleanup.x_RememberSeqFeatCitPubs {
    POST Seq-feat.cit.pub.E
}

# Needed because PubdescBC is "POST" so it can't create
# the *initial* label of the Pub; this rule is not POST, so it is the one
# that gets to see the Pub first.
use m_NewCleanup.x_RememberPubOldLabel { Pub }

# Only the author list of the submission's citation is listed for this rule.
use m_NewCleanup.x_AuthListBCWithFixInitials {
    Seq-submit.sub.cit.authors
}

# Remove empty descr set after other cleaning steps are done
# (hence POST, and hence placed this late in the file).
use m_NewCleanup.x_ClearEmptyDescr {
    POST Bioseq-set,
    POST Bioseq
}

# Remove single-strandedness from non-viral nucleotide sequences
use m_NewCleanup.x_RemoveSingleStrand {
    Bioseq
}

# Call LeavingEntry as late as possible, so keep this as close
# as possible to end of file.
# (Its partner function is EnteringEntry, which is kept near the top of
# the rules.)
use m_NewCleanup.LeavingEntry { POST Seq-entry }
