/*
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Author: Robert Smith, Jonathan Kans, Michael Kornbluh
*
* File Description:
*   Basic and Extended Cleanup of CSeq_entries, etc.
*
* ===========================================================================
*/

// All this functionality is packed into this one file for ease of 
// searching.  If it gets big enough, it will be broken up in the future.

#include <ncbi_pch.hpp>

#include <corelib/ncbi_autoinit.hpp>

#include <objects/misc/sequence_macros.hpp>
#include <objmgr/annot_ci.hpp>
#include <objmgr/feat_ci.hpp>
#include <objmgr/seqdesc_ci.hpp>
#include <objmgr/scope.hpp>
#include <objmgr/seq_vector.hpp>
#include <objmgr/util/seq_loc_util.hpp>
#include <objmgr/util/feature.hpp>
#include <objmgr/util/sequence.hpp>
#include <objtools/cleanup/cleanup_change.hpp>

#include <objtools/cleanup/cleanup.hpp>
#include "newcleanupp.hpp"

#include "cleanup_utils.hpp"

#include <objtools/cleanup/cleanup_pub.hpp>
#include <objtools/cleanup/fix_feature_id.hpp>
#include <objtools/readers/read_util.hpp>

#include <objmgr/bioseq_ci.hpp>
#include <objmgr/object_manager.hpp>
#include <objmgr/scope.hpp>

#include <objects/medline/Medline_entry.hpp>
#include <objects/valid/Comment_rule.hpp>
#include <objects/valid/Comment_set.hpp>

#include <util/ncbi_cache.hpp>
#include <util/sequtil/sequtil_convert.hpp>
#include <util/sequtil/sequtil_manip.hpp>
#include <util/xregexp/regexp.hpp>
#include <util/strsearch.hpp>

#include <objmgr/util/objutil.hpp>

#include "autogenerated_cleanup.hpp"
#include "autogenerated_extended_cleanup.hpp"

BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)

// Definition of the class-level cleanup version constant.
const int CNewCleanup_imp::NCBI_CLEANUP_VERSION = 1;

// From this point on in the translation unit, every use of the identifier
// CompressSpaces is textually replaced by x_CompressSpaces.  This is done so
// that macros such as COMPRESS_STRING_MEMBER pick up our own version instead
// of the one they would otherwise use.
#define CompressSpaces x_CompressSpaces

namespace {

    // a CRegexp that has lock and unlock methods,
    // and also inherits from CObject   
    class CRegexpWithLock : public CRegexp, public CObject {
    public:
        CRegexpWithLock( const CTempStringEx & pattern, 
            CRegexp::TCompile flags ) : CRegexp(pattern, flags) { }

        void Lock(void) { m_mutex.Lock(); }
        void Unlock(void) { m_mutex.Unlock(); }

    private:
        CMutex               m_mutex;
    };
    typedef CRef<CRegexpWithLock> TRegexpWithLockRef;

    // Scoped lock holder for a reference-counted, lockable object.
    // The object is locked as soon as the holder is constructed and
    // unlocked when the holder is destroyed, so there is only one working
    // CLockingRef on the object at a time.
    //
    // The holder is move-only: a copy would share the same lock but unlock
    // it a second time on destruction.  Moving transfers the responsibility
    // for unlocking to the new holder and leaves the source empty.
    template<typename TLockableObj>
    class CLockingRef {
    public:
        // Takes a reference to pLockableObj and locks it.
        // pLockableObj must not be null.
        explicit 
        CLockingRef(TLockableObj *pLockableObj) :
        m_pLockableObj(pLockableObj) 
        {
            m_pLockableObj->Lock();
        }

        // Takes over the lock held by "other"; "other" ends up empty and
        // its destructor will not unlock anything.
        CLockingRef(CLockingRef && other)
        {
            m_pLockableObj.Swap(other.m_pLockableObj);
        }

        // copying would lead to unlocking the same lock twice
        CLockingRef(const CLockingRef &) = delete;
        CLockingRef & operator=(const CLockingRef &) = delete;

        // Unlocks the object, unless the lock was moved elsewhere.
        ~CLockingRef(void) { 
            if( m_pLockableObj.NotEmpty() ) {
                m_pLockableObj->Unlock();
            }
        }

        // Access to the locked object.
        TLockableObj * operator->(void) { return m_pLockableObj.GetPointer(); }

    private:
        CRef<TLockableObj> m_pLockableObj;
    };
    typedef CLockingRef<CRegexpWithLock> CCachedRegexp;

    // careful! the key is compared as a *pointer*, NOT via
    // strcmp or anything like that.  For safety, just use
    // string literals.
    typedef pair<const char *, CRegexp::TCompile> TRegexpKey;
    typedef TRegexpWithLockRef TRegexpValue;
    
    // Cache handler which knows how to build the value for a key that is
    // not in the cache yet: it compiles the pattern (key.first) with the
    // compile flags (key.second).
    class CRegexpCacheHandler : 
        public CCacheElement_Handler<TRegexpKey, TRegexpValue>
    {
    public:
        TRegexpValue CreateValue(const TRegexpKey & regexp_key )
        {
            const char * const pattern = regexp_key.first;
            const CRegexp::TCompile compile_flags = regexp_key.second;

            TRegexpValue compiled_regexp(
                new CRegexpWithLock(pattern, compile_flags));
            return compiled_regexp;
        }
    };
    
    // Cache of compiled regular expressions, keyed by the pattern pointer
    // and the compile flags.  The underlying cache is created with a
    // capacity argument of 100.
    class CRegexpCache {
    private:
        typedef CCache<TRegexpKey, TRegexpValue,
            CRegexpCacheHandler> TUnderlyingCache;

    public:
        CRegexpCache(void)
            : m_Cache(100)
        {
        }

        // Looks the regexp up in the cache and hands it back wrapped in a
        // CCachedRegexp, which locks it on construction.
        // "pattern" is compared by address, so pass string literals.
        CCachedRegexp Get( const char * pattern, 
            CRegexp::TCompile flags = CRegexp::fCompile_default )
        {
            const TRegexpKey cache_key(pattern, flags);
            TRegexpWithLockRef cached_regexp = m_Cache[cache_key];
            return CCachedRegexp(cached_regexp.GetPointer());
        }

    private:
        TUnderlyingCache m_Cache;
    };

    // The actual cache: a single instance with internal linkage (it lives
    // in the anonymous namespace), used through regexpCache.Get(...).
    CRegexpCache regexpCache;
}

// Constructor
// Stores the change recorder and the option bits, derives the mode flags
// (GPIPE mode, genetic code syncing, keeping the top nested set) from the
// options, and creates a scope on the object manager returned by
// CObjectManager::GetInstance().
CNewCleanup_imp::CNewCleanup_imp (CRef<CCleanupChange> changes, Uint4 options)
    : m_Changes(changes),
      m_Options(options),
      m_Objmgr(NULL),
      m_Scope(NULL),
      m_IsGpipe((options & CCleanup::eClean_GpipeMode) != 0),
      m_SyncGenCodes((options & CCleanup::eClean_SyncGenCodes) != 0),
      m_StripSerial(true),
      m_IsEmblOrDdbj(false),
      m_KeepTopNestedSet((options & CCleanup::eClean_KeepTopSet) != 0)
{
    m_Objmgr = CObjectManager::GetInstance ();
    m_Scope.Reset (new CScope (*m_Objmgr));
}

// Destructor
// Nothing to release explicitly here.
CNewCleanup_imp::~CNewCleanup_imp (void)
{
}

// Main methods

void CNewCleanup_imp::BasicCleanupSeqEntry (
    CSeq_entry& se
)
{
    // CAutogeneratedCleanup is auto-generated code (created by datatool
    // from autogenerated_cleanup.txt).  It walks through the CSeq_entry
    // given here and calls back into the functions of this class, which
    // spares us from hand-writing the error-prone traversal code.
    SetGlobalFlags(se);

    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupSeqEntry( se );

    x_PostProcessing();

    // genetic codes are handled last, for every Bioseq within the entry
    EXPLORE_ALL_BIOSEQS_WITHIN_SEQENTRY (bioseq_it, se) {
        SetGeneticCode (*bioseq_it);
    }
}

//LCOV_EXCL_START
//not used by asn_cleanup because we clean the submit block separately
//and use read hooks for the seq-entries
// Basic cleanup of a whole Seq-submit.
// After the auto-generated traversal and the post-processing step, the
// genetic codes of all Bioseqs under the first Seq-entry of the submission
// are set.  Submissions whose data is not a non-empty list of entries are
// left alone at that last step (no entries are created and the data choice
// is not changed).
void CNewCleanup_imp::BasicCleanupSeqSubmit (
    CSeq_submit& ss
)

{
    SetGlobalFlags(ss);
    CAutogeneratedCleanup auto_cleanup( *m_Scope, *this );
    auto_cleanup.BasicCleanupSeqSubmit( ss );
    x_PostProcessing();

    // Calling SetEntrys() on data holding another variant would switch the
    // choice, and front() on an empty list is undefined, so check first.
    if ( ! ss.IsSetData() || ! ss.GetData().IsEntrys() ) {
        return;
    }
    auto& entries = ss.SetData().SetEntrys();
    if (entries.empty()) {
        return;
    }

    CRef<CSeq_entry> se (entries.front());
    if (se.NotEmpty()) {
        EXPLORE_ALL_BIOSEQS_WITHIN_SEQENTRY (bit, *se) {
            CBioseq& bs = *bit;
            SetGeneticCode (bs);
        }
    }
}
//LCOV_EXCL_STOP


// Basic cleanup of a Submit-block: forwards straight to SubmitblockBC().
void CNewCleanup_imp::BasicCleanupSubmitblock(CSubmit_block& block)
{
    SubmitblockBC( block );
}


void CNewCleanup_imp::BasicCleanupSeqAnnot (
    CSeq_annot& sa
)
{
    // There is no Seq-entry context for a lone Seq-annot, so the global
    // flags are reset instead of being set up from the object.
    ResetGlobalFlags();

    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupSeqAnnot( sa );

    x_PostProcessing();
}

void CNewCleanup_imp::BasicCleanupBioseq(
    CBioseq& bs
)
{
    // the global flags are set up from the Bioseq itself
    SetGlobalFlags(bs);

    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupBioseq( bs );

    x_PostProcessing();

    // genetic code is handled last
    SetGeneticCode( bs );
}

void CNewCleanup_imp::BasicCleanupBioseqSet (
    CBioseq_set& bss
)
{
    // the global flags are set up from the Bioseq-set itself
    SetGlobalFlags(bss);

    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupBioseqSet( bss );

    x_PostProcessing();

    // genetic codes are handled last, for every Bioseq within the set
    EXPLORE_ALL_BIOSEQS_WITHIN_SEQSET (bioseq_it, bss) {
        SetGeneticCode (*bioseq_it);
    }
}

void CNewCleanup_imp::BasicCleanupSeqFeat (
    CSeq_feat& sf
)
{
    // There is no Seq-entry context for a lone Seq-feat, so the global
    // flags are reset instead of being set up from the object.
    ResetGlobalFlags();

    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupSeqFeat( sf );

    x_PostProcessing();
}


void CNewCleanup_imp::BasicCleanupBioSource (
    CBioSource& src
)
{
    // There is no Seq-entry context for a lone BioSource, so the global
    // flags are reset instead of being set up from the object.
    ResetGlobalFlags();
    CAutogeneratedCleanup traverser( *m_Scope, *this );

    // The BioSource is cleaned by way of a temporary feature: a copy of
    // it becomes the data of that feature, the feature is cleaned, and
    // the result is copied back into the caller's object.
    CRef<CSeq_feat> wrapper_feat(new CSeq_feat());
    wrapper_feat->SetData().SetBiosrc().Assign(src);

    traverser.BasicCleanupSeqFeat(*wrapper_feat);
    x_PostProcessing();

    src.Assign(wrapper_feat->GetData().GetBiosrc());
}


// Extended cleanup of a single BioSource: the basic cleanup step
// (BiosourceBC) runs first, followed by the extended one (BioSourceEC).
void CNewCleanup_imp::ExtendedCleanup(CBioSource& src)
{
    BiosourceBC( src );
    BioSourceEC( src );
}


void CNewCleanup_imp::BasicCleanupSeqEntryHandle (
    CSeq_entry_Handle& seh
)
{
    // The handle only gives const access to the complete Seq-entry;
    // constness is cast away so that the entry can be cleaned in place.
    CConstRef<CSeq_entry> complete_entry = seh.GetCompleteSeq_entry();
    CSeq_entry& mutable_entry =
        *const_cast<CSeq_entry*>(complete_entry.GetPointer());

    BasicCleanupSeqEntry(mutable_entry);
}

// Basic cleanup of the Bioseq behind a handle.
// The Bioseq is not modified directly: a copy is cleaned with
// BasicCleanupBioseq(), and afterwards the ids, descriptors, instance and
// annotations of the copy are written back through the edit handle.
void CNewCleanup_imp::BasicCleanupBioseqHandle (
    CBioseq_Handle& bsh
)
{
    // clean a copy, and then update via the edit handle

    CRef<CBioseq> new_bioseq( new CBioseq );
    new_bioseq->Assign( *bsh.GetCompleteBioseq() );

    // the edit handle is obtained before the copy is cleaned
    CBioseq_EditHandle edit_handle = bsh.GetEditHandle();

    BasicCleanupBioseq( *new_bioseq );

    // get each part from the copy

    // ids: drop the existing ones, then add every id of the cleaned copy
    edit_handle.ResetId();
    FOR_EACH_SEQID_ON_BIOSEQ( seq_id_iter, *new_bioseq ) {
        edit_handle.AddId( CSeq_id_Handle::GetHandle(**seq_id_iter) );
    }

    // descriptors: only set if the cleaned copy still has any
    edit_handle.ResetDescr();
    if( new_bioseq->IsSetDescr() ) {
        edit_handle.SetDescr( new_bioseq->SetDescr() );
    }

    edit_handle.SetInst( new_bioseq->SetInst() );
    // annotations: remove the existing ones one at a time (a fresh iterator
    // is created for each removal) until none are left, then attach the
    // annotations of the cleaned copy
    while( ! RAW_FIELD_IS_EMPTY_OR_UNSET( *bsh.GetCompleteBioseq(), Annot ) )  {
        CSeq_annot_CI annot_ci( bsh );
        CSeq_annot_EditHandle a = annot_ci->GetEditHandle();
        a.Remove();
    }
    EDIT_EACH_SEQANNOT_ON_BIOSEQ( annot_iter, *new_bioseq ) {
        edit_handle.AttachAnnot( **annot_iter );
    }
}

// Basic cleanup of the Bioseq-set behind a handle.
// The set is not modified directly: a copy is cleaned with
// BasicCleanupBioseqSet(), and afterwards the simple fields, the contained
// Seq-entries and the annotations of the copy are written back through the
// edit handle.
void CNewCleanup_imp::BasicCleanupBioseqSetHandle (
    CBioseq_set_Handle& bssh
)
{
    // clean a copy, and then update via the edit handle

    CRef<CBioseq_set> new_bioseq_set( new CBioseq_set );
    new_bioseq_set->Assign( *bssh.GetCompleteBioseq_set() );

    // the edit handle is obtained before the copy is cleaned
    CBioseq_set_EditHandle edit_handle = bssh.GetEditHandle();

    BasicCleanupBioseqSet( *new_bioseq_set );

    // get each part from the copy

// Local helper macro: reset field Fld on the edit handle, then copy it over
// from the cleaned set if it is set there.
#define BC_COPY_FIELD(Fld) \
    edit_handle.Reset##Fld(); \
    if( new_bioseq_set->IsSet##Fld() ) { \
        edit_handle.Set##Fld( new_bioseq_set->Set##Fld() ); \
    }

    BC_COPY_FIELD(Id);
    BC_COPY_FIELD(Coll);
    BC_COPY_FIELD(Level);
    BC_COPY_FIELD(Class);
    BC_COPY_FIELD(Release);
    BC_COPY_FIELD(Date);
    BC_COPY_FIELD(Descr);

#undef BC_COPY_FIELD

    // copy seq-set field: remove the existing entries one at a time (a fresh
    // iterator is created for each removal) until none are left, then attach
    // the entries of the cleaned copy
    while( ! RAW_FIELD_IS_EMPTY_OR_UNSET( *bssh.GetCompleteBioseq_set(), Seq_set ) )  {
        CSeq_entry_CI entry_ci( bssh );
        CSeq_entry_EditHandle edit = entry_ci->GetEditHandle();
        edit.Remove();
    }
    EDIT_EACH_SEQENTRY_ON_SEQSET( entry_iter, *new_bioseq_set ) {
        edit_handle.AttachEntry( **entry_iter );
    }

    // copy annot field (same remove-all-then-attach approach)
    while( ! RAW_FIELD_IS_EMPTY_OR_UNSET( *bssh.GetCompleteBioseq_set(), Annot ) )  {
        CSeq_annot_CI annot_ci( bssh );
        CSeq_annot_EditHandle edit = annot_ci->GetEditHandle();
        edit.Remove();
    }
    EDIT_EACH_SEQANNOT_ON_SEQSET( annot_iter, *new_bioseq_set ) {
        edit_handle.AttachAnnot( **annot_iter );
    }
}

void CNewCleanup_imp::BasicCleanupSeqAnnotHandle (
    CSeq_annot_Handle& sah
)
{
    // The annotation is not modified directly; a copy of it is cleaned
    // and then put in place of the original.
    CRef<CSeq_annot> cleaned_annot( new CSeq_annot );
    cleaned_annot->Assign( *sah.GetCompleteSeq_annot() );

    CSeq_annot_EditHandle annot_eh = sah.GetEditHandle();

    BasicCleanupSeqAnnot( *cleaned_annot );

    // CSeq_annot_EditHandle has neither ".Set[Fld]()" methods nor a
    // Replace() method, which makes this a little more tricky than the
    // other handle-based cleanups.
    CSeq_entry_EditHandle parent_entry = annot_eh.GetParentEntry();
    if( ! parent_entry ) {
        // not part of anything else: a simple swap will do
        CSeq_annot_Handle replacement_annot =
            m_Scope->AddSeq_annot( *cleaned_annot );
        annot_eh.Swap( replacement_annot );
        return;
    }

    // part of an entry: take the old annotation out and attach the cleaned
    // one to the same parent; the caller's handle then refers to the new one
    annot_eh.Remove();
    sah = parent_entry.AttachAnnot( *cleaned_annot );
}

void CNewCleanup_imp::BasicCleanupSeqFeatHandle (
    CSeq_feat_Handle& sfh
)
{
    // The feature is not modified directly; a copy of the original
    // feature is cleaned and then replaces it through the edit handle.
    CRef<CSeq_feat> cleaned_feat( new CSeq_feat );
    cleaned_feat->Assign( *sfh.GetOriginalSeq_feat() );

    CSeq_feat_EditHandle feat_eh( sfh );

    BasicCleanupSeqFeat( *cleaned_feat );

    feat_eh.Replace( *cleaned_feat );
}


// Basic cleanup of a single Pubdesc.
//   pd           - the Pubdesc to clean (modified in place)
//   strip_serial - value which m_StripSerial takes for the duration of
//                  this call only
// The previous value of m_StripSerial is put back when the function is
// left, including when PubdescBC() exits with an exception.
void CNewCleanup_imp::BasicCleanup(CPubdesc& pd, bool strip_serial)
{
    // restores the flag it was constructed with when it goes out of scope
    struct SRestoreFlag {
        explicit SRestoreFlag(bool& flag) : m_Flag(flag), m_Saved(flag) { }
        ~SRestoreFlag(void) { m_Flag = m_Saved; }

        bool&      m_Flag;
        const bool m_Saved;
    } restore_strip_serial(m_StripSerial);

    m_StripSerial = strip_serial;
    PubdescBC(pd);
}


// Basic cleanup of a single Seqdesc.
// The global flags are reset first; then the auto-generated traversal is
// run on the descriptor, followed by the post-processing step.
void CNewCleanup_imp::BasicCleanup(CSeqdesc& desc)
{
    ResetGlobalFlags();

    CAutogeneratedCleanup traverser( *m_Scope, *this );
    traverser.BasicCleanupSeqdesc( desc );

    x_PostProcessing();
}




// Implementation methods

void CNewCleanup_imp::SetGeneticCode (
    CBioseq& bs
)
{
    // Only done when genetic code syncing was requested, and only when
    // the Bioseq can be found in our scope.
    if ( m_SyncGenCodes ) {
        CBioseq_Handle bioseq_handle = m_Scope->GetBioseqHandle(bs);
        if ( bioseq_handle ) {
            // CCleanup::SetGeneticCodes() reports whether it changed anything
            if ( CCleanup::SetGeneticCodes(bioseq_handle) ) {
                ChangeMade(CCleanupChange::eChangeGeneticCode);
            }
        }
    }
}

// Records change "e" in the change recorder, if there is one.
void CNewCleanup_imp::ChangeMade (CCleanupChange::EChanges e)
{
    if ( ! m_Changes ) {
        // nobody is keeping track of changes
        return;
    }
    m_Changes->SetChanged (e);
}

// Called when a Seq-entry is entered.
// As currently compiled (the "#if 0" sections are disabled), this only
// makes sure that the Seq-entry is known to our scope: if the scope has no
// handle for it yet, the entry is added as a top-level Seq-entry and
// parentized.
void CNewCleanup_imp::EnteringEntry (
    CSeq_entry& se
)

{
#if 0
    SSeqEntryInfo seqEntryInfo;
    if( ! m_SeqEntryInfoStack.empty() ) {
        // inherit from parent by default
        seqEntryInfo = m_SeqEntryInfoStack.top();
    } else {
        seqEntryInfo.m_IsEmblOrDdbj = false;
        seqEntryInfo.m_StripSerial = true;
    }
#endif

    // for cleanup Seq-entry and Seq-submit, set scope and parentize.
    // The scope is asked for the handle of the Seq-entry with
    // eMissing_Null; a valid handle means the scope already has this
    // Seq-entry, in which case there is nothing more to do.
    {{
         CSeq_entry_Handle seh =
             m_Scope->GetSeq_entryHandle(se, CScope::eMissing_Null);
         if (seh) {
#if 0
             // all code paths in this function must result
             // in m_SeqEntryInfoStack getting a "push"
             m_SeqEntryInfoStack.push( m_SeqEntryInfoStack.top() );
#endif
             return;
         }

         // not in the scope yet
         m_Scope->AddTopLevelSeqEntry (se);
         se.Parentize();
     }}

#if 0
    // a few differences based on sequence identifier type
    // (some values are reset here because they shouldn't inherit
    // from higher seq-entry's)
    VISIT_ALL_BIOSEQS_WITHIN_SEQENTRY (bs_itr, se) {
        const CBioseq& bs = *bs_itr;
        FOR_EACH_SEQID_ON_BIOSEQ (sid_itr, bs) {
            const CSeq_id& sid = **sid_itr;
            SWITCH_ON_SEQID_CHOICE (sid) {
                case NCBI_SEQID(Genbank):
                case NCBI_SEQID(Tpg):
                    {
                        const CTextseq_id& tsid = *GET_FIELD (sid, Textseq_Id);
                        if (FIELD_IS_SET (tsid, Accession)) {
                            const string& acc = GET_FIELD (tsid, Accession);
                            if (acc.length() == 6) {
                                seqEntryInfo.m_StripSerial = false;
                            }
                        }
                    }
                    break;
                case NCBI_SEQID(Embl):
                case NCBI_SEQID(Ddbj):
                    seqEntryInfo.m_StripSerial = false;
                    seqEntryInfo.m_IsEmblOrDdbj = true;
                    break;
                case NCBI_SEQID(not_set):
                case NCBI_SEQID(Local):
                case NCBI_SEQID(Other):
                case NCBI_SEQID(General):
                    break;
                case NCBI_SEQID(Gibbsq):
                case NCBI_SEQID(Gibbmt):
                case NCBI_SEQID(Pir): 
                case NCBI_SEQID(Swissprot):
                case NCBI_SEQID(Patent):
                case NCBI_SEQID(Prf):
                case NCBI_SEQID(Pdb):
                case NCBI_SEQID(Gpipe):
                case NCBI_SEQID(Tpe):
                case NCBI_SEQID(Tpd):
                    seqEntryInfo.m_StripSerial = false;
                    break;
                default:
                    break;
            }
        }
    }

    m_SeqEntryInfoStack.push(seqEntryInfo);
#endif
}

// Called when a Seq-entry is left.
// As currently compiled this does nothing: its only statement (popping
// the Seq-entry info stack) is disabled with "#if 0".
void CNewCleanup_imp::LeavingEntry (
    CSeq_entry& se
)

{
#if 0
    m_SeqEntryInfoStack.pop();
#endif
}

// Strips spaces in the string in the following manner: when several
// spaces (spaces and tabs) are met in succession, they are replaced with
// one space.  All spaces after '(' and before ( ')' or ',' ) are stripped.
// A eTrimSpaces change is recorded if StripSpaces() reports a modification.
void CNewCleanup_imp::x_StripSpacesMarkChanged(string& str)
{
    const bool was_modified = StripSpaces(str);
    if (was_modified) {
        ChangeMade(CCleanupChange::eTrimSpaces);
    }
}

// Runs RemoveSpacesBetweenTildes() on the string and records a eTrimSpaces
// change if that function reports a modification.
void CNewCleanup_imp::x_RemoveSpacesBetweenTildesMarkChanged( std::string & str )
{
    const bool was_modified = RemoveSpacesBetweenTildes(str);
    if( was_modified ) {
        ChangeMade(CCleanupChange::eTrimSpaces);
    }
}

// Currently a no-op: the whole body is commented out, so "str" is left
// untouched and no change is recorded.
// The disabled code removed a trailing "~~" after a fixed list of phrases
// and URLs (on non-Windows builds only) and recorded a eTrimSpaces change
// if the string was modified.
void CNewCleanup_imp::X_CommentTildeFixes(std::string & str)
{
/*
#ifndef NCBI_OS_MSWIN
    string orig = str;
    NStr::ReplaceInPlace(str, "based on SOLiD3 (Applied Biosystems)~~", "based on SOLiD3 (Applied Biosystems)", false, false);
    NStr::ReplaceInPlace(str, "Biological resourse center, NITE (NRBC)~~", "Biological resourse center, NITE (NRBC)", false, false);
    NStr::ReplaceInPlace(str, "developmental01.html~~", "developmental01.html", false, false);
    NStr::ReplaceInPlace(str, "http://bionano.toyo.ac.jp/~~", "http://bionano.toyo.ac.jp/", false, false);
    NStr::ReplaceInPlace(str, "http://dictycdb1.biol.tsukuba.ac.jp/acytodb/~~", "http://dictycdb1.biol.tsukuba.ac.jp/acytodb/", false, false);
    NStr::ReplaceInPlace(str, "http://egg.umh.es~~", "http://egg.umh.es", false, false);
    NStr::ReplaceInPlace(str, "http://www.aist.go.jp/~~", "http://www.aist.go.jp/", false, false);
    NStr::ReplaceInPlace(str, "http://www.bio.nite.go.jp/~~", "http://www.bio.nite.go.jp/", false, false);
    NStr::ReplaceInPlace(str, "http://www.bio.nite.go.jp/ngac/e/~~", "http://www.bio.nite.go.jp/ngac/e/", false, false);
    NStr::ReplaceInPlace(str, "http://www.brs.kyushu-u.ac.jp/~fcmic/~~", "http://www.brs.kyushu-u.ac.jp/~fcmic/", false, false);
    NStr::ReplaceInPlace(str, "http://www.miyazaki-u.ac.jp/ir/english/index.html~~", "http://www.miyazaki-u.ac.jp/ir/english/index.html", false, false);
    NStr::ReplaceInPlace(str, "URL:http://www.bio.nite.go.jp/ngac/e/~~", "URL:http://www.bio.nite.go.jp/ngac/e/", false, false);
    if (!NStr::Equal(orig, str)) {
        ChangeMade(CCleanupChange::eTrimSpaces);
    }
#endif //NCBI_OS_MSWIN
*/
}

// Runs NStr::TruncateSpacesInPlace() on the string and records a
// eTrimSpaces change if the length of the string is different afterwards.
void CNewCleanup_imp::x_TruncateSpacesMarkChanged( std::string & str )
{
    const size_t length_before = str.length();

    NStr::TruncateSpacesInPlace(str);

    const bool length_changed = ( str.length() != length_before );
    if( length_changed ) {
        ChangeMade(CCleanupChange::eTrimSpaces);
    }
}

// Runs TrimInternalSemicolons() on the string and records a
// eTrimInternalSemicolons change if the length of the string is different
// afterwards.
void CNewCleanup_imp::x_TrimInternalSemicolonsMarkChanged( std::string & str )
{
    const size_t length_before = str.length();

    TrimInternalSemicolons(str);

    const bool length_changed = ( str.length() != length_before );
    if( length_changed ) {
        ChangeMade(CCleanupChange::eTrimInternalSemicolons);
    }
}

// Basic cleanup of a Bioseq-set: gives a class to sets whose class is
// unset, "not-set" or "other".  Such a set becomes a nuc-prot set if it
// holds exactly one nucleotide and at least one protein and nothing forces
// it to be a genbank set; otherwise it becomes a genbank set.  A
// eChangeBioseqSetClass change is recorded in both cases.
void CNewCleanup_imp::SeqsetBC (
    CBioseq_set& bss
)
{
    const bool class_is_missing =
        ! FIELD_IS_SET(bss, Class) || 
        GET_FIELD(bss, Class) == CBioseq_set::eClass_not_set || 
        GET_FIELD(bss, Class) == CBioseq_set::eClass_other;
    if( ! class_is_missing ) {
        // sets which already have a real class are left alone
        return;
    }

    int nuc_count = 0;
    int prot_count = 0;
    bool force_genbank = false;

    CBioseq_set_Handle set_handle = m_Scope->GetBioseq_setHandle( bss );
    if( set_handle ) {
        // count the nucleotides and proteins
        for( CBioseq_CI seq_ci( set_handle, CSeq_inst::eMol_not_set, CBioseq_CI::eLevel_Mains );
             seq_ci; ++seq_ci )
        {
            if( seq_ci->IsAa() ) {
                ++prot_count;
            } else if( seq_ci->IsNa() ) {
                ++nuc_count;
            }
        }

        // Look at the descendent Bioseq-sets.
        // There seems to be no such thing as CBioseq_set_CI, so the
        // Seq-entries are iterated instead, since every Bioseq-set should
        // be guaranteed to be in a Seq-entry.
        for( CSeq_entry_CI entry_ci( set_handle ); entry_ci; ++entry_ci ) {
            if( ! entry_ci->IsSet() ) {
                continue;
            }
            CBioseq_set_Handle child_set = entry_ci->GetSet();
            // anything other than a segset or a parts set forces genbank
            if( ! FIELD_EQUALS(child_set, Class, NCBI_BIOSEQSETCLASS(segset)) && 
                ! FIELD_EQUALS(child_set, Class, NCBI_BIOSEQSETCLASS(parts)) ) 
            {
                force_genbank = true;
            }
        }

        // The top level needs its own check because of the somewhat kludgy
        // way the Bioseq-sets are iterated above.
        if( ! FIELD_EQUALS(set_handle, Class, NCBI_BIOSEQSETCLASS(segset)) && 
            ! FIELD_EQUALS(set_handle, Class, NCBI_BIOSEQSETCLASS(parts)) ) 
        {
            force_genbank = true;
        }
    }

    const bool looks_like_nuc_prot =
        (nuc_count == 1) && (prot_count > 0) && ! force_genbank;
    if( looks_like_nuc_prot ) {
        bss.SetClass( CBioseq_set::eClass_nuc_prot );
    } else {
        bss.SetClass( CBioseq_set::eClass_genbank );
    }
    ChangeMade(CCleanupChange::eChangeBioseqSetClass);
}

// Maps partialness flags to a MolInfo completeness value:
//   partial5 and partial3 -> no_ends
//   partial5 only         -> no_left
//   partial3 only         -> no_right
//   neither, but partial  -> partial
//   none of the flags     -> complete
static CMolInfo::TCompleteness GetCompletenessFromFlags(bool partial5, bool partial3, bool partial)
{
    if (partial5) {
        if (partial3) {
            return CMolInfo::eCompleteness_no_ends;
        }
        return CMolInfo::eCompleteness_no_left;
    }
    if (partial3) {
        return CMolInfo::eCompleteness_no_right;
    }
    if (partial) {
        return CMolInfo::eCompleteness_partial;
    }
    return CMolInfo::eCompleteness_complete;
}

// Basic cleanup specific to protein Bioseqs.
//  - An explicit "linear" topology is removed.
//  - For proteins with a GIBBSQ id whose (first) title mentions
//    "{C-terminal}" and/or "{N-terminal}", protein features which are
//    flagged partial but have both ends of the location complete get the
//    corresponding end(s) of the location marked partial, and the MolInfo
//    completeness is brought in line with that.
void CNewCleanup_imp::ProtSeqBC (CBioseq& bs)
{
    // Only proteins are of interest here
    if (!bs.IsSetInst()) {
        return;
    }
    CSeq_inst& seq_inst = bs.SetInst();
    const bool is_protein =
        seq_inst.IsSetMol() && seq_inst.GetMol() == CSeq_inst::eMol_aa;
    if (!is_protein) {
        return;
    }

    if (seq_inst.IsSetTopology() &&
        seq_inst.GetTopology() == CSeq_inst::eTopology_linear) {
        seq_inst.ResetTopology();
        ChangeMade(CCleanupChange::eChangeBioseqInst);
    }

    // The rest only applies to sequences with a GIBBSQ id
    if (!bs.IsSetId()) {
        return;
    }
    bool found_gibbsq = false;
    for (const auto& seq_id : bs.GetId()) {
        if (seq_id->IsGibbsq()) {
            found_gibbsq = true;
            break;
        }
    }
    if (!found_gibbsq) {
        return;
    }

    // Look for partialness clues in the title; only the first title
    // descriptor is examined
    if (!bs.IsSetDescr()) {
        return;
    }
    bool make_partial5 = false;
    bool make_partial3 = false;
    for (const auto& desc : bs.GetDescr().Get()) {
        if (!desc->IsTitle()) {
            continue;
        }
        const string& title = desc->GetTitle();
        make_partial5 = (NStr::Find(title, "{C-terminal}") != string::npos);
        make_partial3 = (NStr::Find(title, "{N-terminal}") != string::npos);
        break;
    }
    if (!make_partial5 && !make_partial3) {
        return;
    }

    // Fix up the protein features which are missing the partials
    if (!bs.IsSetAnnot()) {
        return;
    }
    const CMolInfo::TCompleteness wanted =
        GetCompletenessFromFlags(make_partial5, make_partial3, true);

    for (auto& annot : bs.SetAnnot()) {
        if (!annot->IsSetData() || !annot->GetData().IsFtable()) {
            continue;
        }
        for (auto& feat : annot->SetData().SetFtable()) {
            // note - we are only fixing partials if *both*
            // ends were left as complete. One end being
            // set as partial means that someone was doing this
            // deliberately.
            const bool needs_fixing =
                feat->IsSetData() && 
                feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_prot &&
                feat->IsSetPartial() && feat->GetPartial() && 
                feat->IsSetLocation() && 
                !feat->GetLocation().IsPartialStart(eExtreme_Biological) &&
                !feat->GetLocation().IsPartialStop(eExtreme_Biological);
            if (!needs_fixing) {
                continue;
            }

            if (make_partial5) {
                feat->SetLocation().SetPartialStart(true, eExtreme_Biological);
            }
            if (make_partial3) {
                feat->SetLocation().SetPartialStop(true, eExtreme_Biological);
            }
            ChangeMade(CCleanupChange::eChangeSeqloc);

            // update the first MolInfo descriptor which does not have
            // the wanted completeness yet
            for (auto& desc : bs.SetDescr().Set()) {
                if (!desc->IsMolinfo()) {
                    continue;
                }
                const CMolInfo& molinfo = desc->GetMolinfo();
                if (molinfo.IsSetCompleteness() &&
                    molinfo.GetCompleteness() == wanted) {
                    continue;
                }
                desc->SetMolinfo().SetCompleteness(wanted);
                ChangeMade(CCleanupChange::eChangeMolInfo);
                break;
            }
        }
    }
}


// Basic cleanup of a Seq-id.
// Currently only local ids holding a string are processed: the spaces
// at the ends of that string are truncated.
void CNewCleanup_imp::SeqIdBC( CSeq_id &seq_id )
{
    // the CObject_id is only looked for in certain Seq-id types
    if( ! seq_id.IsLocal() ) {
        return;
    }
    CObject_id & object_id = GET_MUTABLE(seq_id, Local);

    // currently, we only process the Str ones
    if( ! FIELD_IS(object_id, Str) ) {
        return;
    }

    x_TruncateSpacesMarkChanged( GET_MUTABLE(object_id, Str) );
}

// Searches "target" for search_pattern and replaces what was found with
// "replacement", at most max_replace times (0 means no limit).
// "target" is modified in place.
//
// Example: 
//     string foo = "Test:   FOO   BAR    :BAZ."
//     s_RegexpReplace( foo, ":[ ]+", ": " );
// After this, foo is "Test: FOO   BAR    :BAZ."
//
// Returns "true" if at least one replacement was done.

static const int s_RegexpReplace_UnlimitedReplacements = 0;

static
bool s_RegexpReplace( string &target, 
    const char *search_pattern, 
    const char *replacement,
    int max_replace = s_RegexpReplace_UnlimitedReplacements,
    CRegexp::ECompile compile_flags = CRegexp::fCompile_default )
{
    CRegexpUtil regexp_util( target );

    const size_t replacement_count = regexp_util.Replace(
        search_pattern, replacement,
        compile_flags, CRegexp::fMatch_default, max_replace );

    // the result is swapped into the target, which is faster than assigning
    regexp_util.GetResult().swap( target ); 

    return replacement_count != 0;
}

// Three-way lexicographical comparison of [first1, last1) and
// [first2, last2), similar to lexicographical_compare_3way.  It is
// implemented here because that one is an SGI extension, not in the
// standard.
//
// "compare" is called on corresponding elements and must return an int
// which is zero for elements considered equal.  The first non-zero result
// is returned as is.  If one sequence runs out first, the shorter one is
// the lesser (-1 if it is the first one, 1 if it is the second one);
// 0 is returned if both run out together.
template <class Iter1, class Iter2, class Compare>
static int ncbi_lexicographical_compare_3way( 
    Iter1 first1, Iter1 last1, 
    Iter2 first2, Iter2 last2, 
    Compare compare )
{
    while( first1 != last1 && first2 != last2 ) {
        const int element_result = compare( *first1, *first2 );
        if( element_result != 0 ) {
            return element_result;
        }
        ++first1;
        ++first2;
    }

    if( first1 != last1 ) {
        // first is longer
        return 1;
    }
    // first ran out: equal if second ran out too, otherwise second is longer
    return ( first2 == last2 ) ? 0 : -1;
}

// Predicate: true if the two characters are equal once both have been
// converted with toupper().
class PNocase_EqualChar
{
public:
    bool operator()( const char ch1, const char ch2 ) const {
        // toupper() is only defined for values representable as
        // unsigned char (and EOF), so a plain char, which may be
        // negative, has to be converted first.
        return toupper(static_cast<unsigned char>(ch1)) ==
               toupper(static_cast<unsigned char>(ch2));
    }
};

// Predicate: true if the first character is less than the second one
// once both have been converted with toupper().
class PNocase_LessChar
{
public:
    bool operator()( const char ch1, const char ch2 ) const {
        // toupper() is only defined for values representable as
        // unsigned char (and EOF), so a plain char, which may be
        // negative, has to be converted first.
        return toupper(static_cast<unsigned char>(ch1)) <
               toupper(static_cast<unsigned char>(ch2));
    }
};

// Three-way comparison of two characters after both have been converted
// with toupper(): the result is negative, zero or positive if the first
// character is less than, equal to or greater than the second one.
class PNocase_CompareChar
{
public:
    int operator()( const char ch1, const char ch2 ) const {
        // toupper() is only defined for values representable as
        // unsigned char (and EOF), so a plain char, which may be
        // negative, has to be converted first.
        const int upper1 = toupper(static_cast<unsigned char>(ch1));
        const int upper2 = toupper(static_cast<unsigned char>(ch2));
        return upper1 - upper2;
    }
};

// C compares using toupper, as opposed to the built-in
// stuff which seems to use tolower, thus producing
// some differences in sorting order in some places.
// Once we've fully moved away from C there's probably
// no harm in replacing all calls to s_CompareNoCaseCStyle with
// normal functions like NStr::CompareNocase()
static
int s_CompareNoCaseCStyle( const string &s1, const string &s2 ) 
{
    return ncbi_lexicographical_compare_3way(
            s1.begin(), s1.end(), 
            s2.begin(), s2.end(), 
            PNocase_CompareChar() );
}

// Gives the name which goes with the genome of the BioSource, for the
// plastid-like genomes (apicoplast, chloroplast, chromoplast, kinetoplast,
// leucoplast, plastid, proplastid).  kEmptyStr is returned for every other
// genome.  The returned reference is to a static string.
static
const string &s_GenomeToPlastidName( const CBioSource& biosrc )
{
    SWITCH_ON_BIOSOURCE_GENOME (biosrc) {
    case NCBI_GENOME(apicoplast): {
        static const string kApicoplast("apicoplast");
        return kApicoplast;
    }
    case NCBI_GENOME(chloroplast): {
        static const string kChloroplast("chloroplast");
        return kChloroplast;
    }
    case NCBI_GENOME(chromoplast): {
        static const string kChromoplast("chromoplast");
        return kChromoplast;
    }
    case NCBI_GENOME(kinetoplast): {
        static const string kKinetoplast("kinetoplast");
        return kKinetoplast;
    }
    case NCBI_GENOME(leucoplast): {
        static const string kLeucoplast("leucoplast");
        return kLeucoplast;
    }
    case NCBI_GENOME(plastid): {
        static const string kPlastid("plastid");
        return kPlastid;
    }
    case NCBI_GENOME(proplastid): {
        static const string kProplastid("proplastid");
        return kProplastid;
    }
    default:
        break;
    }
    return kEmptyStr;
}

// Removes "prefix" from the beginning of "str" if it is there (the check
// is done with the given case sensitivity).
// Returns true if the prefix was removed, false if str was left as is.
static
bool s_RemoveInitial( string &str, const string &prefix, NStr::ECase case_to_use )
{
    const bool has_prefix = NStr::StartsWith( str, prefix, case_to_use );
    if( ! has_prefix ) {
        return false;
    }
    str.erase( 0, prefix.length() );
    return true;
}

// Given the position of the opening paren in a string, this returns
// the position of the closing paren (keeping track of any nested parens
// in the middle).
// It returns NPOS if the paren is not closed.
// open_paren_pos must be a valid index into str and str[open_paren_pos]
// must be '(' (both are asserted).
// This function is not currently smart; it doesn't know about quotes
// or anything
static
SIZE_TYPE s_MatchingParenPos( const string &str, SIZE_TYPE open_paren_pos )
{
    // the position is validated before it is used to look into the string
    _ASSERT( open_paren_pos < str.length() );
    _ASSERT( str[open_paren_pos] == '(' );

    // nesting level. start at 1 since we know there's an open paren
    int level = 1;

    SIZE_TYPE pos = open_paren_pos + 1;
    for( ; pos < str.length(); ++pos ) {
        switch( str[pos] ) {
            case '(':
                // nesting deeper
                ++level;
                break;
            case ')':
                // closed a level of nesting
                --level;
                if( 0 == level ) {
                    // reached the top: we're closing the initial paren,
                    // so we return our position
                    return pos;
                }
                break;
            default:
                // ignore other characters.
                // maybe in the future we'll handle ignoring parens in quotes or
                // things like that.
                break;
        }
    }
    // ran off the end of the string without closing the initial paren
    return NPOS;
}

// Ordering predicate for accession strings: true when str1 sorts
// strictly before str2, ignoring case.
static bool s_AccessionCompare (
    const string& str1,
    const string& str2
)

{
    const int ordering = NStr::CompareNocase( str1, str2 );
    return ordering < 0;
}

// Equality predicate for accession strings: true when the two strings
// are the same, ignoring case.
static bool s_AccessionEqual (
    const string& str1,
    const string& str2
)

{
    return NStr::EqualNocase (str1, str2) ? true : false;
}


// Cleans the origin string of a GenBank block with CleanVisStringJunk
// and records a trim-spaces change if the string was modified.
void CNewCleanup_imp::GBblockOriginBC ( string& str )
{
    const bool was_modified = CleanVisStringJunk(str);
    if (! was_modified) {
        return;
    }
    ChangeMade (CCleanupChange::eTrimSpaces);
}


// Basic cleanup of a GenBank block.  In order, this:
//   - cleans the extra-accession strings, then sorts and uniques them
//     (both case-insensitively, via s_AccessionCompare/s_AccessionEqual)
//   - re-splits the keyword list so that no keyword contains a semicolon
//   - rewrites TPA (re)assembly keyword variants to "TPA:assembly"
//   - removes duplicate keywords without reordering them
//   - drops Source and Origin if they consist of just "."
//   - cleans the Date, Div and Taxonomy strings
void CNewCleanup_imp::GBblockBC (
    CGB_block& gbk
)

{
    CLEAN_STRING_LIST (gbk, Extra_accessions);

    // sorting must come before the uniqueness pass
    if (! EXTRAACCN_ON_GENBANKBLOCK_IS_SORTED (gbk, s_AccessionCompare)) {
        SORT_EXTRAACCN_ON_GENBANKBLOCK (gbk, s_AccessionCompare);
        ChangeMade (CCleanupChange::eCleanQualifiers);
    }

    if (! EXTRAACCN_ON_GENBANKBLOCK_IS_UNIQUE (gbk, s_AccessionEqual)) {
        UNIQUE_EXTRAACCN_ON_GENBANKBLOCK (gbk, s_AccessionEqual);
        ChangeMade (CCleanupChange::eCleanQualifiers);
    }

    // split keywords at semicolons: join everything into one
    // semicolon-delimited string and split it back apart
    if (gbk.IsSetKeywords()) {
        string one_string = NStr::Join(gbk.GetKeywords(), ";");
        gbk.ResetKeywords();
        NStr::Split(one_string, ";", gbk.SetKeywords());
    }

    CLEAN_STRING_LIST (gbk, Keywords);

    // matches "tpa_assembly", "tpa_reassembly" and "tpa:reassembly"
    // in any letter case
    CCachedRegexp reassembly_regex
        = regexpCache.Get("^tpa(?:_|[_:]re)assembly$",
                          CRegexp::fCompile_ignore_case);
    // use the GenBank-block keyword macro, consistent with the type of
    // gbk and with the other macros used in this function
    EDIT_EACH_KEYWORD_ON_GENBANKBLOCK(keyword_it, gbk) {
        string & sKeyword = *keyword_it;
        if( reassembly_regex->IsMatch(sKeyword) ) {
            sKeyword = "TPA:assembly";
            ChangeMade (CCleanupChange::eCleanQualifiers);
        }
    }

    // EMBL and DDBJ records compare keywords case-sensitively when
    // removing duplicates; everything else ignores case
    if( m_IsEmblOrDdbj ) {
        UNIQUE_WITHOUT_SORT_KEYWORD_ON_GENBANKBLOCK( gbk, PCase );
    } else {
        UNIQUE_WITHOUT_SORT_KEYWORD_ON_GENBANKBLOCK( gbk, PNocase );
    }

    CLEAN_STRING_MEMBER_JUNK (gbk, Source);
    // a lone period carries no information, so remove the field entirely
    if( FIELD_EQUALS(gbk, Source, ".") ) {
        RESET_FIELD(gbk, Source);
        ChangeMade(CCleanupChange::eRemoveQualifier);
    }
    if( FIELD_EQUALS(gbk, Origin, ".") ) {
        RESET_FIELD(gbk, Origin);
        ChangeMade(CCleanupChange::eRemoveQualifier);
    }

    CLEAN_STRING_MEMBER (gbk, Date);
    CLEAN_STRING_MEMBER (gbk, Div);
    CLEAN_STRING_MEMBER (gbk, Taxonomy);
}

// Basic cleanup of an EMBL block: cleans the extra-accession strings,
// sorts and uniques them (using the case-insensitive s_AccessionCompare
// and s_AccessionEqual), cleans the keyword strings and then removes
// duplicate keywords (PCase comparison) without reordering them.
void CNewCleanup_imp::EMBLblockBC (
    CEMBL_block& emb
)

{
    CLEAN_STRING_LIST (emb, Extra_acc);

    // sort first; the uniqueness pass below runs on the sorted list
    if (! EXTRAACCN_ON_EMBLBLOCK_IS_SORTED (emb, s_AccessionCompare)) {
        SORT_EXTRAACCN_ON_EMBLBLOCK (emb, s_AccessionCompare);
        ChangeMade (CCleanupChange::eCleanQualifiers);
    }

    if (! EXTRAACCN_ON_EMBLBLOCK_IS_UNIQUE (emb, s_AccessionEqual)) {
        UNIQUE_EXTRAACCN_ON_EMBLBLOCK (emb, s_AccessionEqual);
        ChangeMade (CCleanupChange::eCleanQualifiers);
    }

    CLEAN_STRING_LIST (emb, Keywords);

    // keywords keep their original order; only repeats are dropped
    UNIQUE_WITHOUT_SORT_KEYWORD_ON_EMBLBLOCK (emb, PCase);
}


// Give it a map that maps case-insensitive string to some other type, 
// and it will return any matches that are a prefix for str.
// For example, if you have a mapping that includes ("foo" to 7), then passing
// str as "Foo something", will return the ("foo" to 7) mapping.
template< typename TMapType >
typename TMapType::const_iterator s_FindInMapAsPrefix( const string &str_arg, const TMapType &the_map )
{
    // holds the str we're looking at, which might be str_arg, or
    // might be another string constructed from it
    const string *str = &str_arg;

    // use this to delete strings created in this function, if any.
    // we don't read from it directly
    auto_ptr<string> temp_str;

    // chop off characters that can't be in the map, so they don't count
    SIZE_TYPE first_bad_char = 0;
    for( ; first_bad_char < str_arg.length(); ++first_bad_char ) {
        const char ch = str_arg[first_bad_char];
        if( ! isalnum(ch) && ch != '-' && ch != '_' && ch != ' ' ) {
            temp_str.reset( new string(str_arg, 0, first_bad_char) );
            str = temp_str.get();
            break;
        }
    }

    typename TMapType::const_iterator it = the_map.lower_bound( *str );
    if( it != the_map.begin() && ( it == the_map.end() || ! NStr::EqualNocase(*str, it->first) ) ) {
        --it;
    }
    if ( it != the_map.end() && NStr::StartsWith(*str, it->first, NStr::eNocase)) {
        return it;
    }
    return the_map.end();
}

// The set counterpart of s_FindInMapAsPrefix: returns an iterator to an
// element of the_set that is a case-insensitive prefix of str, or
// the_set.end() if there is none.
template< typename TSetType >
typename TSetType::const_iterator s_FindInSetAsPrefix( const string &str, const TSetType &the_set )
{
    typename TSetType::const_iterator candidate = the_set.lower_bound( str );

    // unless lower_bound landed exactly on str, the only element that can
    // be a prefix is the one immediately before it
    const bool exact_hit =
        ( candidate != the_set.end() && NStr::EqualNocase(str, *candidate) );
    if( ! exact_hit && candidate != the_set.begin() ) {
        --candidate;
    }

    if( candidate == the_set.end() ||
        ! NStr::StartsWith(str, *candidate, NStr::eNocase) )
    {
        return the_set.end();
    }
    return candidate;
}


// copy "str" because we're changing it anyway
// returns true if we found anything
static
bool s_StringHasOrgModPrefix(const string &str, string::size_type &out_val_start_pos, TORGMOD_SUBTYPE &out_subtype)
{
    SIZE_TYPE pos = str.find_first_of(": ="), pos2;
    if (pos != 0  &&  pos != NPOS
        &&  (pos2 = str.find_first_not_of(": =", pos)) != NPOS) {
        try {
            string val = str.substr(0, pos);
            COrgMod::TSubtype subtype = COrgMod::GetSubtypeValue(val, COrgMod::eVocabulary_insdc);
            if ( !COrgMod::IsDiscouraged(subtype) ) {
                out_subtype       = subtype;
                out_val_start_pos = pos2;
                return true;
            }
        } catch (CSerialException&) {
        }
    }
    return false;
}

// returns true if we found anything
static
bool s_StringHasSubSourcePrefix(const string &str, string::size_type &out_val_start_pos, TSUBSOURCE_SUBTYPE &out_subtype)
{
    SIZE_TYPE pos = str.find_first_of(": ="), pos2;
    if (pos != 0  &&  pos != NPOS
        &&  (pos2 = str.find_first_not_of(": =", pos)) != NPOS) {
        try {
            string val = str.substr(0, pos);
            CSubSource::TSubtype subtype;
            if (NStr::EqualNocase(val, "Lat-long") || NStr::EqualNocase(val, "Latitude-Longitude")) {
                subtype = CSubSource::eSubtype_lat_lon;
            } else {
                subtype = CSubSource::GetSubtypeValue(val, CSubSource::eVocabulary_insdc);
            }
            if ( subtype == CSubSource::eSubtype_fwd_primer_name ||
                 subtype == CSubSource::eSubtype_fwd_primer_seq ||
                 subtype == CSubSource::eSubtype_rev_primer_name ||
                 subtype == CSubSource::eSubtype_rev_primer_seq ||
                 !CSubSource::IsDiscouraged(subtype) ) {
                out_subtype       = subtype;
                out_val_start_pos = pos2;
                return true;
            }
        } catch (CSerialException&) {
        }
    } else {
        // did not find delimiters
        try {
            CSubSource::TSubtype subtype = CSubSource::GetSubtypeValue(str);
            if ( !CSubSource::IsDiscouraged(subtype) && CSubSource::NeedsNoText(subtype)) {
                out_subtype       = subtype;
                out_val_start_pos = str.length();
                return true;
            }
        } catch (CSerialException&) {
        }
    }
    return false;
}


// is st1 < st2

static bool s_SubsourceCompare (
    const CRef<CSubSource>& st1,
    const CRef<CSubSource>& st2
)

{
    const CSubSource& sbs1 = *(st1);
    const CSubSource& sbs2 = *(st2);

    TSUBSOURCE_SUBTYPE chs1 = GET_FIELD (sbs1, Subtype);
    TSUBSOURCE_SUBTYPE chs2 = GET_FIELD (sbs2, Subtype);

    if (chs1 < chs2) return true;
    if (chs1 > chs2) return false;

    if (FIELD_IS_SET (sbs2, Name)) {
        if (! FIELD_IS_SET (sbs1, Name)) return true;
        if (s_CompareNoCaseCStyle(GET_FIELD (sbs1, Name), GET_FIELD (sbs2, Name)) < 0) return true;
    }

    return false;
}

// Two SubSource's are equal and duplicates if:
// they have the same subtype
// and the same name (or don't require a name).

static bool s_SubsourceEqual (
    const CRef<CSubSource>& st1,
    const CRef<CSubSource>& st2
)

{
    const CSubSource& sbs1 = *(st1);
    const CSubSource& sbs2 = *(st2);

    TSUBSOURCE_SUBTYPE chs1 = GET_FIELD (sbs1, Subtype);
    TSUBSOURCE_SUBTYPE chs2 = GET_FIELD (sbs2, Subtype);

    if (chs1 != chs2) return false;
    if (CSubSource::NeedsNoText (chs2)) return true;

    if (FIELD_IS_SET (sbs1, Name) && FIELD_IS_SET (sbs2, Name)) {
        if (NStr::EqualNocase (GET_FIELD (sbs1, Name), GET_FIELD (sbs2, Name))) return true;
    }
    if (! FIELD_IS_SET (sbs1, Name) && ! FIELD_IS_SET (sbs2, Name)) return true;

    return false;
}

// Cleanup specific to a BioSource that lives on a feature.  In order:
//   - merges every orgmod of subtype "other" that has a subname into the
//     first such orgmod (joined with "; ") and erases the rest
//   - does the same for subsources of subtype "other" that have a name
//   - moves the feature's comment, if set, into the first "other"
//     subsource (creating that subsource if needed) and resets the comment
//   - compresses spaces in each Mod string of the Org-ref
// biosrc  - the BioSource to clean
// seqfeat - the feature holding it; only its Comment is read and reset
void CNewCleanup_imp::BiosourceFeatBC (
    CBioSource& biosrc,
    CSeq_feat & seqfeat
)
{
    // consolidate all orgmods of subtype "other" into one
    CRef<COrgMod> pFirstOtherOrgMod;
    if (biosrc.IsSetOrg() && biosrc.GetOrg().IsSetOrgname() && biosrc.GetOrg().GetOrgname().IsSetMod()) {
        auto& mod_set = biosrc.SetOrg().SetOrgname().SetMod();
        auto mod_it = mod_set.begin();
        // manual iteration because elements are erased while walking
        while (mod_it != mod_set.end()) {
            COrgMod & orgmod = **mod_it;

            // we're only cleaning the ones of type "other"
            if (!FIELD_EQUALS(orgmod, Subtype, NCBI_ORGMOD(other)) ||
                !FIELD_IS_SET(orgmod, Subname))
            {
                ++mod_it;
                continue;
            }

            if (pFirstOtherOrgMod) {
                // fold this one into the first "other" orgmod and drop it;
                // erase() hands back the iterator to continue from
                STRING_FIELD_APPEND(*pFirstOtherOrgMod, Subname, "; ", GET_STRING_FLD_OR_BLANK(orgmod, Subname));
                ChangeMade(CCleanupChange::eChangeOrgmod);
                mod_it = mod_set.erase(mod_it);
                ChangeMade(CCleanupChange::eRemoveOrgmod);
            } else {
                // remember the first one; later ones are merged into it
                pFirstOtherOrgMod.Reset(&orgmod);
                ++mod_it;
            }
        }
    }

    // consolidate all subsources of subtype "other" into one
    CRef<CSubSource> pFirstOtherSubSource;
    EDIT_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
        CSubSource &subsrc = **subsrc_iter;

        // we're only cleaning the ones of type "other"
        if( ! FIELD_EQUALS(subsrc, Subtype, NCBI_SUBSOURCE(other) ) ||
            ! FIELD_IS_SET(subsrc, Name) ) 
        {
            continue;
        }

        if( pFirstOtherSubSource ) {
            // fold this one into the first "other" subsource and drop it
            STRING_FIELD_APPEND(*pFirstOtherSubSource, Name, "; ", GET_STRING_FLD_OR_BLANK(subsrc, Name) );
            ChangeMade(CCleanupChange::eChangeSubsource);
            ERASE_SUBSOURCE_ON_BIOSOURCE(subsrc_iter, biosrc);
            ChangeMade(CCleanupChange::eRemoveSubSource);
        } else {
            pFirstOtherSubSource.Reset( &subsrc );
        }
    }

    // transfer feat comment (if any) to the end of the first (and, after
    // the consolidation above, only) "other" subsource note that has a name
    if( FIELD_IS_SET(seqfeat, Comment) ) {

        if( ! pFirstOtherSubSource ) {
            // create an empty subsource note if none found
            pFirstOtherSubSource.Reset( new CSubSource );
            SET_FIELD(*pFirstOtherSubSource, Subtype, NCBI_SUBSOURCE(other) );
            ADD_SUBSOURCE_TO_BIOSOURCE(biosrc, pFirstOtherSubSource);
        }

        STRING_FIELD_APPEND(*pFirstOtherSubSource, Name, "; ", GET_FIELD(seqfeat, Comment));
        ChangeMade ( CCleanupChange::eChangeSubsource );
        RESET_FIELD(seqfeat, Comment);
        ChangeMade ( CCleanupChange::eChangeComment );
    }

    // special orgmod cleanup just for features (yes, is stupid, but is what C toolkit does)
    if (biosrc.IsSetOrg() && biosrc.GetOrg().IsSetMod()) {
        EDIT_EACH_MOD_ON_ORGREF (it, biosrc.SetOrg()) {
            if (x_CompressSpaces(*it)) {
                ChangeMade ( CCleanupChange::eTrimSpaces );
            }
        }
    }
}

// Applies a fixed list of text substitutions to str, each of which
// corrects one specific, known case of an inconsistently applied tilde
// (some double the tilde, some replace it with a space or drop it).
// Compiled to a no-op when NCBI_OS_MSWIN is defined.
static void s_CorrectTildes (
    string& str
)

{
#ifndef NCBI_OS_MSWIN
    // each row is { text to look for, text to put in its place };
    // the rows are applied one after another, in this order
    static const char* const kTildeFixes[][2] = {
        { "were ~25 cm in height (~3 weeks)", "were ~~25 cm in height (~~3 weeks)" },
        { "generally ~3 weeks", "generally ~~3 weeks" },
        { "sequencing (~4 96-well plates)", "sequencing (~~4 96-well plates)" },
        { "size distribution (~2 kb)", "size distribution (~~2 kb)" },
        { "sequencing (~3 96-well plates)", "sequencing (~~3 96-well plates)" },
        { "vector. 1~2 ul of ligated", "vector. 1~~2 ul of ligated" },
        // disabled:
        // { "Lambda FLC I.~Islet cells were provided", "Lambda FLC I.~~Islet cells were provided" },
        { "different strains~of mice", "different strains of mice" },
        { "oligo-dT-NotI primer~(5'-biotin", "oligo-dT-NotI primer (5'-biotin" },
        { "sizes of 200~800 bp were purified", "sizes of 200~~800 bp were purified" },
        { "Tween 20 (~50 ml per tree)", "Tween 20 (~~50 ml per tree)" },
        { "the SMART approach (~http://www.evrogen.com", "the SMART approach (http://www.evrogen.com" },
        { "the morning (~10 am) with", "the morning (~~10 am) with" },
        { "(host) sequences (~10%)", "(host) sequences (~~10%)" },
        // disabled:
        // { "unidirectionally.~ High quality", "unidirectionally. High quality" },
        // { "onlysubmitted.~ Average", "onlysubmitted. Average" },
        { "Plasmid; ~The F03-1270", "Plasmid; The F03-1270" },
        { "using STS-PCR~from Eb", "using STS-PCR from Eb" },
        { "specific to~the Eb", "specific to the Eb" },
        { "side of insert);  , M.F., Lennon", "side of insert); Bonaldo, M.F., Lennon" },
        { "Uni-ZAP XR vector. 1~2 ul of", "Uni-ZAP XR vector. 1~~2 ul of" },
        { "from diploid~Secale montanum", "from diploid Secale montanum" },
        { "homology with~U43516,", "homology with U43516," },
        // disabled:
        // { "from http//www.biobase.dk/~ddbase", "from http//www.biobase.dk/~~ddbase" },
        { "plasmid; ~Assembled EST", "plasmid; Assembled EST" },
        { "databases.~Different cDNA", "databases. Different cDNA" },
        { "enzyme PstI.~DH5-alpha", "enzyme PstI. DH5-alpha" },
        { "as they~were prepared", "as they were prepared" },
        { "loci in~the genome", "loci in the genome" },
        { "P{CaSpeR}Cp1~50C (FBti0004219)", "P{CaSpeR}Cp1~~50C (FBti0004219)" },
        { "seedlings with 2~4 leaves", "seedlings with 2~~4 leaves" },
        { "tween 20 (~50mLs per tree)", "tween 20 (~~50mLs per tree)" }
    };

    for (const auto& fix : kTildeFixes) {
        NStr::ReplaceInPlace (str, fix[0], fix[1]);
    }
#endif //NCBI_OS_MSWIN
}


// True when both subsources have the same subtype, or when neither has
// a subtype set.  A set subtype never matches an unset one.
bool s_SameSubtype(const CSubSource& s1, const CSubSource& s2)
{
    const bool first_set  = s1.IsSetSubtype();
    const bool second_set = s2.IsSetSubtype();

    if (first_set != second_set) {
        return false;
    }
    if (!first_set) {
        // neither is set
        return true;
    }
    return s1.GetSubtype() == s2.GetSubtype();
}


// close enough if second name contains the first
bool s_NameCloseEnough(const CSubSource& s1, const CSubSource& s2)
{
    if (!s1.IsSetName() && !s2.IsSetName()) {
        return true;
    } else if (!s1.IsSetName() || !s2.IsSetName()) {
        return false;
    }
    const string& n1 = s1.GetName();
    const string& n2 = s2.GetName();

    if (NStr::Equal(n1, n2)) {
        return true;
    } else {
        return false;
    }
}


// Sorts the subsources of biosrc with s_SubsourceCompare and then removes
// adjacent entries that have the same subtype and the same name (as
// judged by s_SameSubtype and s_NameCloseEnough); of each such pair the
// earlier entry is the one erased.  Does nothing for fewer than two
// subsources.
void CNewCleanup_imp::SubSourceListBC(CBioSource& biosrc)
{
    // with zero or one subsource there is nothing to sort or compare
    if (!biosrc.IsSetSubtype() || biosrc.GetSubtype().size() <= 1) {
        return;
    }

    if (!SUBSOURCE_ON_BIOSOURCE_IS_SORTED(biosrc, s_SubsourceCompare)) {
        SORT_SUBSOURCE_ON_BIOSOURCE(biosrc, s_SubsourceCompare);
        ChangeMade(CCleanupChange::eCleanSubsource);
    }

    // walk the list with a pair of adjacent iterators
    CBioSource::TSubtype& subsources = biosrc.SetSubtype();
    CBioSource::TSubtype::iterator current = subsources.begin();
    CBioSource::TSubtype::iterator following = current;
    for (++following; following != subsources.end(); ++following) {
        const bool duplicate =
            s_SameSubtype(**current, **following) &&
            s_NameCloseEnough(**current, **following);
        if (duplicate) {
            // erase() returns the element after the erased one, which is
            // the one "following" currently points at
            current = subsources.erase(current);
            ChangeMade(CCleanupChange::eCleanSubsource);
        } else {
            ++current;
        }
    }
}

// Repairs a date-time of the form "<date>T<h>:...Z" in which the hour
// has only one digit, by putting a "0" in front of the time part.
// The input must split into exactly two pieces at 'T', and the time part
// must be longer than 4 characters, end in 'Z' and have ':' as its
// second character.  Anything else is returned unchanged.
static string s_RepairISOCollDateTimeString (string& date_string)
{
    vector<string> parts;
    NStr::Split(date_string, "T", parts);

    if (parts.size() != 2) {
        return date_string;
    }

    const string& time_part = parts[1];
    const size_t  time_len  = time_part.length();
    const bool one_digit_hour =
        time_len > 4 && time_part[time_len-1] == 'Z' && time_part[1] == ':';
    if (! one_digit_hour) {
        return date_string;
    }

    return parts[0] + "T" + "0" + time_part;
}

// Repairs the ISO date-time(s) in a collection-date value, which is
// either a single date-time or two of them separated by '/'.
// Each date-time is passed through s_RepairISOCollDateTimeString.
// Returns the repaired value; if coll_date splits into neither one nor
// two pieces at '/', it is returned unchanged.
static string s_RepairISOCollDateTimePair (string& coll_date)
{
    vector<string> pieces;
    NStr::Split(coll_date, "/", pieces);

    if (pieces.size() == 1) {
        // single date-time: hand back the repaired value rather than
        // discarding it
        return s_RepairISOCollDateTimeString(pieces[0]);
    }

    if (pieces.size() == 2) {
        // date range: repair both ends and put the separator back
        string fstdate = s_RepairISOCollDateTimeString(pieces[0]);
        string scddate = s_RepairISOCollDateTimeString(pieces[1]);
        return fstdate + "/" + scddate;
    }

    return coll_date;
}

// Basic cleanup of a BioSource.  In order, this:
//   - resets Genome if it is "virion" and Origin if it is "unknown"
//   - cleans each subsource (name/attrib strings, country, altitude,
//     collection date and primer sequence normalization) and erases
//     subsources that are empty, redundant with the genome, or
//     duplicates of the previous subsource
//   - sorts and de-duplicates the subsource list (SubSourceListBC)
//   - cleans the PCR primers and resets them if nothing is left
//   - corrects known tilde problems in "other" orgmods and subsources
//   - cleans the OrgName, then applies FixEnvironmentalSample,
//     RemoveNullTerms and FixGenomeForQualifiers
//   - runs the x_PostBiosource and x_PostOrgRef passes
void CNewCleanup_imp::BiosourceBC (
    CBioSource& biosrc
)
{
    if( FIELD_EQUALS( biosrc, Genome, CBioSource::eGenome_virion ) ) {
        RESET_FIELD( biosrc, Genome );
        ChangeMade ( CCleanupChange::eChangeBioSourceGenome );
    }

    if( FIELD_EQUALS( biosrc, Origin, NCBI_ORIGIN(unknown) ) ) {
        RESET_FIELD(biosrc, Origin);
        ChangeMade ( CCleanupChange::eChangeBioSourceOrigin );
    }

    // remove spaces and convert to lowercase in fwd_primer_seq and rev_primer_seq.
    if( FIELD_IS_SET(biosrc, Subtype) ) {
        // "prev" is the most recent subsource that was kept; end() means
        // that there is none yet
        SUBSOURCE_ON_BIOSOURCE_Type::iterator prev = 
            SUBSOURCE_ON_BIOSOURCE_Set(biosrc).end();
        EDIT_EACH_SUBSOURCE_ON_BIOSOURCE (it, biosrc) {
            CSubSource& sbs = **it;

            TSUBSOURCE_SUBTYPE chs = GET_FIELD (sbs, Subtype);
            if (CSubSource::NeedsNoText (chs)) {
                // name is required - set it to empty string
                if( ! FIELD_IS_SET(sbs, Name) || ! GET_FIELD(sbs, Name).empty() ) {
                    SET_FIELD (sbs, Name, "");
                    ChangeMade(CCleanupChange::eCleanSubsource);
                }
                CLEAN_STRING_MEMBER(sbs, Attrib);
            } else {
                CLEAN_AND_COMPRESS_STRING_MEMBER(sbs, Name);
                if( ! FIELD_IS_SET(sbs, Name) ) {
                    // name must be set
                    SET_FIELD (sbs, Name, "");
                    ChangeMade(CCleanupChange::eCleanSubsource);
                }
                x_RemoveFlankingQuotes( GET_MUTABLE(sbs, Name) );
                CLEAN_STRING_MEMBER(sbs, Attrib);
            }

            // normalize the various spellings of the United States to "USA"
            if( chs == NCBI_SUBSOURCE(country) ) {
                string &country = GET_MUTABLE(sbs, Name);
                static const string kUSPrefix( "United States:" );
                if( NStr::EqualNocase(country, "United States") || 
                    NStr::EqualNocase(country, "United States of America") || 
                    NStr::EqualNocase(country, "U.S.A.") ) 
                {
                    country = "USA";
                    ChangeMade(CCleanupChange::eCleanSubsource);
                } else if( NStr::StartsWith(country, kUSPrefix, NStr::eNocase) ) {
                    country.replace( 0, kUSPrefix.length(), "USA:" );
                    ChangeMade(CCleanupChange::eCleanSubsource);
                }

            }

            if( chs == NCBI_SUBSOURCE(altitude) ) {
                string &altitude = GET_MUTABLE(sbs, Name);

                // normalize units part (that is, the ending) if possible
                // (e.g. "meters", etc. to "m.")
                // Note that we do NOT count a match if it's just a number because 
                // we can't be sure that the submitter wasn't thinking "feet" or whatever.
                CCachedRegexp altitude_regex = regexpCache.Get(
                    "^([+-]?[0-9]+(\\.[0-9]+)?) ?(m|meter[s]?|metre[s]?)\\.?$",
                    CRegexp::fCompile_ignore_case );

                if( altitude_regex->IsMatch(altitude) ) {
                    // keep the numeric part (group 1) and append " m"
                    string new_altitude = altitude_regex->GetSub(altitude, 1); 
                    new_altitude += " m";
                    if( altitude != new_altitude ) {
                        altitude = new_altitude;
                        ChangeMade(CCleanupChange::eCleanSubsource);
                    }
                }
            }

            /*
            if( chs == NCBI_SUBSOURCE(lat_lon) ) {
                string &lat_lon = GET_MUTABLE(sbs, Name);

                CCachedRegexp lat_lon_with_comma = regexpCache.Get(
                    "^[-.0-9]+ ., [-.0-9]+ .$");
                if( lat_lon_with_comma->IsMatch(lat_lon) ) {
                    // remove the comma
                    SIZE_TYPE comma_pos = lat_lon.find(',');
                    _ASSERT(comma_pos != NPOS );
                    lat_lon.erase(comma_pos, 1);
                    ChangeMade(CCleanupChange::eCleanSubsource);
                }
            }
            */

            // repair ISO date-times in the collection date
            if ( chs == NCBI_SUBSOURCE(collection_date) ) {
                string &coll_date = GET_MUTABLE(sbs, Name);
                string new_date = s_RepairISOCollDateTimePair(coll_date);
                if (!NStr::Equal(new_date, coll_date)) {
                    coll_date = new_date;
                    ChangeMade(CCleanupChange::eCleanSubsource);
                }
            }

           if ( chs == NCBI_SUBSOURCE(fwd_primer_seq) ||
                chs == NCBI_SUBSOURCE(rev_primer_seq) )
            {
                // "before" is a copy so that it can be compared with the
                // cleaned value afterwards
                const string before = GET_FIELD (sbs, Name);
                CPCRPrimerSeq::Clean( GET_MUTABLE(sbs, Name) );
                const string& after = GET_FIELD (sbs, Name);
                if ( before != after ) {
                    ChangeMade (CCleanupChange::eCleanSubsource);
                }
            }

            // determine whether we should remove this subsource:
            if(  (! FIELD_IS_SET(sbs, Name) || GET_FIELD(sbs, Name).empty()) &&
                ! CSubSource::NeedsNoText( chs ) )
            {
                // no name, although this subtype needs one
                ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
                ChangeMade(CCleanupChange::eCleanSubsource);
                continue;
            } else if( chs == NCBI_SUBSOURCE(plastid_name) &&
                STRING_FIELD_MATCH(sbs, Name, s_GenomeToPlastidName(biosrc) ) )
            {
                // plastid name just repeats what the genome already says
                ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
                ChangeMade(CCleanupChange::eCleanSubsource);
                continue;
            } else if( prev != SUBSOURCE_ON_BIOSOURCE_Set(biosrc).end() ) {
                TSUBSOURCE_SUBTYPE prev_chs = GET_FIELD (**prev, Subtype);
                const string &name = GET_FIELD(sbs, Name);
                const string &prev_name = GET_FIELD(**prev, Name);

                // same subtype as the previous kept subsource, and either
                // no text is needed, the names match ignoring case, or
                // (for "other") the previous name already contains this one
                if ( (chs == prev_chs) &&
                    ( CSubSource::NeedsNoText(chs) ||
                    NStr::EqualNocase(prev_name, name) ||
                    (prev_chs == NCBI_SUBSOURCE(other) &&
                    NStr::Find(prev_name, name) != NPOS))) 
                {
                    ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
                    ChangeMade(CCleanupChange::eCleanSubsource);
                    continue;
                } else if ( (chs == prev_chs) &&
                    prev_chs == NCBI_SUBSOURCE(other) &&
                    NStr::Find (name, prev_name) != NPOS )
                {
                    // this "other" name contains the previous one: keep
                    // this content in the previous slot and erase this entry
                    (**prev).Assign( sbs );
                    ERASE_SUBSOURCE_ON_BIOSOURCE(it, biosrc);
                    ChangeMade(CCleanupChange::eCleanSubsource);
                    continue;
                }
            }

            // only reached when the subsource was kept
            prev = it;
        }
    }

    // sort and remove duplicates.
    SubSourceListBC(biosrc);

    // PCR Primers
    if( FIELD_IS_SET(biosrc, Pcr_primers) ) {
        PCRReactionSetBC( GET_MUTABLE(biosrc, Pcr_primers) );
        if( GET_FIELD(biosrc, Pcr_primers).Get().empty() ) {
            RESET_FIELD(biosrc, Pcr_primers);
            ChangeMade(CCleanupChange::eChangePCRPrimers);
        }
    }

    // correct specific cases of inconsistently applied tildes
    if (biosrc.IsSetOrg() && biosrc.GetOrg().IsSetOrgname()) {
        auto& orgname = biosrc.SetOrg().SetOrgname();
        if (orgname.IsSetMod()) {
            auto& mod_set = orgname.SetMod();
            for (auto& orgmod_it : mod_set) {
                COrgMod & orgmod = *orgmod_it;

                // we're only correcting tildes for the ones of type "other"
                if (!FIELD_EQUALS(orgmod, Subtype, NCBI_ORGMOD(other)) ||
                    !FIELD_IS_SET(orgmod, Subname))
                {
                    continue;
                }

                string &subname = GET_MUTABLE(orgmod, Subname);
                s_CorrectTildes(subname);
            }
        }
    }

    EDIT_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
        CSubSource &subsrc = **subsrc_iter;

        // we're only correcting tildes for the ones of type "other"
        if( ! FIELD_EQUALS(subsrc, Subtype, NCBI_SUBSOURCE(other) ) ||
            ! FIELD_IS_SET(subsrc, Name) ) 
        {
            continue;
        }

        string &name = GET_MUTABLE(subsrc, Name);
        s_CorrectTildes(name);
    }

    if (biosrc.IsSetOrg()) {
        if (biosrc.GetOrg().IsSetOrgname()) {
            OrgnameBC(biosrc.SetOrg().SetOrgname(), biosrc.SetOrg());
        }
    }

    // each of these reports whether it changed anything
    if (biosrc.FixEnvironmentalSample()) {
        ChangeMade(CCleanupChange::eChangeSubsource);
    }
    if (biosrc.RemoveNullTerms()) {
        ChangeMade(CCleanupChange::eChangeBioSourceOther);
    }
    if (biosrc.FixGenomeForQualifiers()) {
        ChangeMade(CCleanupChange::eChangeBioSourceGenome);
    }

    // final passes over the BioSource and its Org-ref
    x_PostBiosource(biosrc);
    if (biosrc.IsSetOrg()) {
        x_PostOrgRef(biosrc.SetOrg());
    }
}

// Final pass over a BioSource.  In order, this:
//   - resets Genome if it is "unknown"
//   - and, if there are subsources:
//       - blanks the name of subsources whose subtype needs no text
//       - erases plastid-name subsources that match the genome
//       - sets Genome to "plasmid" if a plasmid-name subsource exists and
//         Genome is unset, "unknown" or "genomic"
//       - erases subsources with no name unless their subtype needs none
//       - sorts and uniques the subsources, and removes the subsource
//         list if it ended up empty
void CNewCleanup_imp::x_PostBiosource( CBioSource& biosrc )
{
    if( FIELD_EQUALS(biosrc, Genome, NCBI_GENOME(unknown) ) ) {
        RESET_FIELD(biosrc, Genome);
        ChangeMade(CCleanupChange::eChangeBioSourceGenome);
    }

    if (BIOSOURCE_HAS_SUBSOURCE (biosrc)) {

        // remove plastid-name subsource if the value is the same as the biosource location
        const string &plastid_name = s_GenomeToPlastidName( biosrc );
        
        bool plasmid_subsource_found = false;
        EDIT_EACH_SUBSOURCE_ON_BIOSOURCE (it, biosrc) {
            CSubSource& sbs = **it;
            TSUBSOURCE_SUBTYPE chs = GET_FIELD (sbs, Subtype);
            if (CSubSource::NeedsNoText (chs)) {
                // this subtype carries no text, so any non-blank name is
                // replaced with an empty one
                if (sbs.IsSetName() && !NStr::IsBlank(sbs.GetName())) {
                    RESET_FIELD (sbs, Name);
                    SET_FIELD (sbs, Name, "");
                    ChangeMade (CCleanupChange::eCleanSubsource);
                }
            } else if (chs == NCBI_SUBSOURCE(plastid_name)) {
                // plasTid
                if (NStr::EqualNocase (GET_FIELD (sbs, Name), plastid_name)) {
                    ERASE_SUBSOURCE_ON_BIOSOURCE (it, biosrc);
                    ChangeMade (CCleanupChange::eCleanSubsource);
                }
            } else if ( chs == NCBI_SUBSOURCE(plasmid_name) ) {
                // plasMid
                plasmid_subsource_found = true;
            }
        }

        // set genome to "plasmid" under some conditions
        if( plasmid_subsource_found ) {
            if( ! FIELD_IS_SET(biosrc, Genome) || 
                GET_FIELD(biosrc, Genome) == NCBI_GENOME(unknown) || 
                GET_FIELD(biosrc, Genome) == NCBI_GENOME(genomic) ) 
            { 
                biosrc.SetGenome( NCBI_GENOME(plasmid) );
                ChangeMade(CCleanupChange::eChangeBioSourceGenome);
            }
        }

        // remove those with no name unless it has a subtype that doesn't need a name.
        EDIT_EACH_SUBSOURCE_ON_BIOSOURCE (it, biosrc) {
            CSubSource& sbs = **it;
            if (FIELD_IS_SET (sbs, Name) && ! GET_FIELD(sbs, Name).empty() ) continue;
            TSUBSOURCE_SUBTYPE chs = GET_FIELD (sbs, Subtype);
            if (CSubSource::NeedsNoText (chs)) continue;
            ERASE_SUBSOURCE_ON_BIOSOURCE (it, biosrc);
            ChangeMade (CCleanupChange::eCleanSubsource);
        }

        // sort and remove duplicates.
        if (! SUBSOURCE_ON_BIOSOURCE_IS_SORTED (biosrc, s_SubsourceCompare)) {
            SORT_SUBSOURCE_ON_BIOSOURCE (biosrc, s_SubsourceCompare);
            ChangeMade (CCleanupChange::eCleanSubsource);
        }

        if (! SUBSOURCE_ON_BIOSOURCE_IS_UNIQUE (biosrc, s_SubsourceEqual)) {
            UNIQUE_SUBSOURCE_ON_BIOSOURCE (biosrc, s_SubsourceEqual);
            ChangeMade (CCleanupChange::eCleanSubsource);
        }

        // the erasures above may have left nothing behind
        REMOVE_IF_EMPTY_SUBSOURCE_ON_BIOSOURCE(biosrc);
    }
}




// Decides whether a db_xref should be thrown away.  A Dbtag is bad if:
//   - its db is unset, blank, or one of "PID", "PIDg", "NID"
//     (compared ignoring case), or
//   - its tag is unset, is the integer 0, is a blank string, or is
//     neither an integer nor a string.
static bool s_DbtagIsBad (
    CDbtag& dbt
)

{
    // the database name must be present and meaningful
    if (! FIELD_IS_SET (dbt, Db)) {
        return true;
    }
    const string& db_name = GET_FIELD(dbt, Db);
    if (NStr::IsBlank (db_name)) {
        return true;
    }
    static const char* const kRejectedDbs[] = { "PID", "PIDg", "NID" };
    for (const char* rejected : kRejectedDbs) {
        if (NStr::EqualNocase(db_name, rejected)) {
            return true;
        }
    }

    // the tag must be present and carry a real value
    if (! FIELD_IS_SET( dbt, Tag)) {
        return true;
    }
    const CObject_id& tag = GET_FIELD(dbt, Tag);
    if (FIELD_IS (tag, Id)) {
        return GET_FIELD (tag, Id) == 0;
    }
    if (FIELD_IS (tag, Str)) {
        return NStr::IsBlank (GET_FIELD (tag, Str));
    }

    // neither an integer nor a string
    return true;
}

// Cleans one Org-ref mod string with TrimSpacesSemicolonsAndCommas and
// records a trim-spaces change if the string was modified.
void CNewCleanup_imp::OrgrefModBC (string& str)
{
    const bool was_trimmed = TrimSpacesSemicolonsAndCommas(str);
    if (! was_trimmed) {
        return;
    }
    ChangeMade (CCleanupChange::eTrimSpaces);
}

// Basic cleanup of an Org-ref: cleans the taxname, common name and
// synonym strings, cleans the OrgName if there is one, and cleans each
// db_xref.  Any extra Dbtags that x_SplitDbtag produces are appended to
// the db_xref list afterwards.
void CNewCleanup_imp::OrgrefBC (COrg_ref& org)

{
    CLEAN_STRING_MEMBER (org, Taxname);
    CLEAN_STRING_MEMBER (org, Common);
    CLEAN_STRING_LIST (org, Syn);

    if (FIELD_IS_SET (org, Orgname)) {
        OrgnameBC (GET_MUTABLE (org, Orgname), org);
    }

    if (! ORGREF_HAS_DBXREF (org)) {
        return;
    }

    // tags split off during the walk are collected here and only added
    // once the walk over the existing list is finished
    vector< CRef< CDbtag > > split_off;
    EDIT_EACH_DBXREF_ON_ORGREF (dbx_it, org) {
        CDbtag& tag = **dbx_it;
        DbtagBC(tag);
        x_SplitDbtag(tag, split_off );
    }

    if( split_off.empty() ) {
        return;
    }
    COrg_ref::TDb& db_list = org.SetDb();
    for (const auto& extra_tag : split_off) {
        db_list.push_back( extra_tag );
    }
    ChangeMade (CCleanupChange::eChangeDbxrefs);
}

// Final pass over an Org-ref: erases db_xrefs that s_DbtagIsBad rejects,
// then sorts and uniques the remaining db_xrefs and the synonyms.
void CNewCleanup_imp::x_PostOrgRef( COrg_ref& org )
{
    // drop unusable db_xrefs first so they take no part in sort/unique
    EDIT_EACH_DBXREF_ON_ORGREF (it, org) {
        CDbtag& dbt = **it;
        if (s_DbtagIsBad (dbt)) {
            ERASE_DBXREF_ON_ORGREF (it, org);
            ChangeMade (CCleanupChange::eCleanDbxrefs);
        }
    }

    // sort/unique db_xrefs
    if (! DBXREF_ON_ORGREF_IS_SORTED (org, s_DbtagCompare)) {
        SORT_DBXREF_ON_ORGREF (org, s_DbtagCompare);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }
    if (! DBXREF_ON_ORGREF_IS_UNIQUE (org, s_DbtagEqual)) {
        UNIQUE_DBXREF_ON_ORGREF (org, s_DbtagEqual);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }

    // sort/unique syns
    if (! SYN_ON_ORGREF_IS_SORTED (org, s_OrgrefSynCompare)) {
        SORT_SYN_ON_ORGREF (org, s_OrgrefSynCompare);
        ChangeMade (CCleanupChange::eCleanOrgref);
    }
    if (! SYN_ON_ORGREF_IS_UNIQUE (org, s_OrgrefSynEqual)) {
        UNIQUE_SYN_ON_ORGREF (org, s_OrgrefSynEqual);
        ChangeMade (CCleanupChange::eCleanOrgref);
    }

}

// is om1 < om2
// to sort subtypes together.

static bool s_OrgModCompare (
    const CRef<COrgMod>& om1,
    const CRef<COrgMod>& om2
)

{
    const COrgMod& omd1 = *(om1);
    const COrgMod& omd2 = *(om2);

    // subtype comparison
    TORGMOD_SUBTYPE subtype1 = GET_FIELD (omd1, Subtype);
    TORGMOD_SUBTYPE subtype2 = GET_FIELD (omd2, Subtype);
    if (subtype1 < subtype2) return true;
    if (subtype1 > subtype2) return false;

    // subname comparison
    const string& subname1 = GET_FIELD (omd1, Subname);
    const string& subname2 = GET_FIELD (omd2, Subname);
    const int subname_comparison = NStr::CompareNocase( subname1, subname2 );
    if( subname_comparison < 0 ) {
        return true;
    } else if( subname_comparison > 0 ) {
        return false;
    }

    // attrib comparison (realistically, we don't expect to fall back to this)
    const string& attrib1 = ( FIELD_IS_SET(omd1, Attrib) ? GET_FIELD (omd1, Attrib) : kEmptyStr );
    const string& attrib2 = ( FIELD_IS_SET(omd2, Attrib) ? GET_FIELD (omd2, Attrib) : kEmptyStr );

    const int attrib_comparison = NStr::CompareNocase( attrib1, attrib2 );
    if (attrib_comparison < 0) {
        return true;
    } else {
        return false;
    }
}

// Two OrgMod's are equal and duplicates if:
// they have the same subname and same subtype

static bool s_OrgModEqual (
    const CRef<COrgMod>& om1,
    const CRef<COrgMod>& om2
)

{
    const COrgMod& omd1 = *(om1);
    const COrgMod& omd2 = *(om2);

    const string& subname1 = GET_FIELD (omd1, Subname);
    const string& subname2 = GET_FIELD (omd2, Subname);
    if (! NStr::EqualNocase (subname1, subname2)) return false;

    const string& attrib1 = ( FIELD_IS_SET(omd1, Attrib) ? GET_FIELD (omd1, Attrib) : kEmptyStr );
    const string& attrib2 = ( FIELD_IS_SET(omd2, Attrib) ? GET_FIELD (omd2, Attrib) : kEmptyStr );
    if (! NStr::EqualNocase (attrib1, attrib2)) return false;

    TORGMOD_SUBTYPE chs1 = GET_FIELD (omd1, Subtype);
    TORGMOD_SUBTYPE chs2 = GET_FIELD (omd2, Subtype);
    if (chs1 == chs2) return true;

    return false;
}

// Basic cleanup of an OrgName.
//
// Steps, in order:
//   1. clean the Attrib, Lineage and Div strings;
//   2. run OrgmodBC on every OrgMod and erase those left without a
//      (non-blank) subname;
//   3. erase "other" OrgMods whose text is "<qualifier><sep><value>" when
//      another OrgMod already carries exactly that subtype and value;
//   4. sort the OrgMods, run x_OrgnameModBC (passing the Org-ref's common
//      name, or blank), sort again and remove duplicates;
//   5. remove the OrgMod list altogether if it ended up empty.
void CNewCleanup_imp::OrgnameBC (
    COrgName& onm, COrg_ref &org_ref
)

{
    CLEAN_STRING_MEMBER (onm, Attrib);
    CLEAN_STRING_MEMBER (onm, Lineage);
    CLEAN_STRING_MEMBER_JUNK (onm, Div);

    // clean each modifier; drop the ones with nothing left in the subname
    EDIT_EACH_ORGMOD_ON_ORGNAME (mod_it, onm) {
        COrgMod& mod = **mod_it;
        OrgmodBC (mod);
        const bool has_subname =
            FIELD_IS_SET (mod, Subname) && ! NStr::IsBlank (GET_FIELD (mod, Subname));
        if (! has_subname) {
            ERASE_ORGMOD_ON_ORGNAME (mod_it, onm);
            ChangeMade (CCleanupChange::eRemoveOrgmod);
        }
    }

    // erase structured notes that already match value
    // (Note: This is O(N^2).  Maybe worth converting to a faster algo?)
    EDIT_EACH_ORGMOD_ON_ORGNAME (mod_it, onm) {
        COrgMod& mod = **mod_it;
        if (mod.GetSubtype() != NCBI_ORGMOD(other)) {
            continue;
        }

        // split the note on the first of ' ', '=' or ':'
        string qual_name, qual_value;
        NStr::SplitInTwo( mod.GetSubname(), " =:", qual_name, qual_value );

        bool is_redundant = false;
        try {
            // throws if qual_name is not a known subtype name; in that case
            // the note is simply kept
            const COrgMod::TSubtype named_subtype = COrgMod::GetSubtypeValue(qual_name);
            NStr::TruncateSpacesInPlace(qual_value);
            FOR_EACH_ORGMOD_ON_ORGNAME (other_it, onm) {
                const bool same_subtype = ((*other_it)->GetSubtype() == named_subtype);
                if (same_subtype
                    && NStr::EqualCase((*other_it)->GetSubname(), qual_value)) {
                    is_redundant = true;
                    break;
                }
            }
        } catch (CSerialException& ) {
        }

        if (is_redundant) {
            ERASE_ORGMOD_ON_ORGNAME (mod_it, onm);
            ChangeMade (CCleanupChange::eCleanOrgmod);
        }
    }

    // first sort, before the list-level cleanup
    const bool sorted_before = ORGMOD_ON_ORGNAME_IS_SORTED (onm, s_OrgModCompare);
    if (! sorted_before) {
        SORT_ORGMOD_ON_ORGNAME (onm, s_OrgModCompare);
        ChangeMade (CCleanupChange::eCleanOrgmod);
    }

    // clean Orgmod list
    x_OrgnameModBC( onm, GET_STRING_FLD_OR_BLANK(org_ref, Common) );

    // sort again after the list-level cleanup
    const bool sorted_after = ORGMOD_ON_ORGNAME_IS_SORTED (onm, s_OrgModCompare);
    if (! sorted_after) {
        SORT_ORGMOD_ON_ORGNAME (onm, s_OrgModCompare);
        ChangeMade (CCleanupChange::eCleanOrgmod);
    }

    const bool already_unique = ORGMOD_ON_ORGNAME_IS_UNIQUE (onm, s_OrgModEqual);
    if (! already_unique) {
        UNIQUE_ORGMOD_ON_ORGNAME (onm, s_OrgModEqual);
        ChangeMade (CCleanupChange::eCleanOrgmod);
    }

    REMOVE_IF_EMPTY_ORGMOD_ON_ORGNAME(onm);
}

// Rewrites every match of the pattern "[ ]*:[ ]*" in str (a colon with any
// run of spaces on either side) to a bare ":".  Returns whatever
// s_RegexpReplace reports for the replacement.
static bool RemoveSpaceBeforeAndAfterColon (
    string& str
)

{
    // May need to create a custom implementation if this
    // regex becomes a bottleneck
    const bool replace_result = s_RegexpReplace( str, "[ ]*:[ ]*", ":");
    return replace_result;
}

// Basic cleanup of a single OrgMod.
//
// Cleans and compresses Subname and Attrib, trims internal semicolons and
// flanking quotes from the subname, tidies colons in the subname of
// specimen_voucher / culture_collection / bio_material modifiers, and lets
// the OrgMod remove its own abbreviation.
void CNewCleanup_imp::OrgmodBC (
    COrgMod& omd
)
{
    CLEAN_AND_COMPRESS_STRING_MEMBER (omd, Subname);
    if (FIELD_IS_SET (omd, Subname)) {
        string& cleaned_subname = GET_MUTABLE(omd, Subname);
        x_TrimInternalSemicolonsMarkChanged( cleaned_subname );
        x_RemoveFlankingQuotes( cleaned_subname );
    }

    CLEAN_AND_COMPRESS_STRING_MEMBER (omd, Attrib);

    const TORGMOD_SUBTYPE mod_subtype = GET_FIELD (omd, Subtype);
    const bool is_collection_qual =
        mod_subtype == NCBI_ORGMOD(specimen_voucher) ||
        mod_subtype == NCBI_ORGMOD(culture_collection) ||
        mod_subtype == NCBI_ORGMOD(bio_material);

    if (is_collection_qual && FIELD_IS_SET (omd, Subname)) {
        string& voucher = GET_MUTABLE (omd, Subname);
        const string::size_type len_before = voucher.length();

        RemoveSpaceBeforeAndAfterColon (voucher);
        // collapse only the first "::" into ":"
        NStr::ReplaceInPlace( voucher, "::", ":", 0, 1 );

        // a change is detected through the string's length, since both
        // steps above can only shorten it
        if( voucher.length() != len_before ) {
            ChangeMade (CCleanupChange::eTrimSpaces);
        }
    }

    if (omd.RemoveAbbreviation()) {
        ChangeMade(CCleanupChange::eCleanOrgmod);
    }
}

// Returns true if str is non-empty and consists solely of decimal digits.
// An empty string yields false.
bool s_IsAllDigits(const string& str)
{
    if (str.empty()) {
        return false;
    }
    for (const char ch : str) {
        // isdigit() has undefined behaviour for negative arguments other
        // than EOF, so bytes >= 0x80 must be passed as unsigned char.
        if (!isdigit(static_cast<unsigned char>(ch))) {
            return false;
        }
    }
    return true;
}

// Basic cleanup of a single Dbtag (db_xref).
//
// Nothing is done unless both Db and Tag are set and Db is non-blank.
// Otherwise, in order:
//   - surrounding whitespace is removed from Db, and a string tag is passed
//     through TrimSpacesSemicolonsAndCommas;
//   - legacy database names are replaced by their current spelling;
//   - integer tags of HGNC, VGNC and MGI are turned into "<DB>:<number>"
//     strings;
//   - string tags of HPRD, MGI, HGNC, VGNC and RGD get their prefix
//     normalized.
// Every modification is reported through ChangeMade().
void CNewCleanup_imp::DbtagBC (
    CDbtag& dbtag
)

{
    if (! FIELD_IS_SET (dbtag, Db)) return;
    if (! FIELD_IS_SET (dbtag, Tag)) return;

    // NB: db aliases the Dbtag's own Db string, so assigning to it edits
    // the Dbtag directly.
    string& db = GET_MUTABLE (dbtag, Db);
    if (NStr::IsBlank (db)) return;

    const size_t len = db.length();
    NStr::TruncateSpacesInPlace(db);
    if (len != db.length()) {
        ChangeMade(CCleanupChange::eTrimSpaces);
    }

    if (dbtag.GetTag().IsStr()) {
        if (TrimSpacesSemicolonsAndCommas(dbtag.SetTag().SetStr())) {
            ChangeMade(CCleanupChange::eTrimSpaces);
        }
    }

    // Rename legacy / mis-cased database names.  Note that some entries are
    // matched case-insensitively and others case-sensitively.
    if (NStr::EqualNocase(db, "Swiss-Prot")
        || NStr::EqualNocase (db, "SWISSPROT")
        || NStr::EqualNocase (db, "UniProt/Swiss-Prot")) {
        db = "UniProtKB/Swiss-Prot";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "SPTREMBL")  ||
               NStr::EqualNocase(db, "TrEMBL")  ||
               NStr::EqualNocase(db, "UniProt/TrEMBL") ) {
        db = "UniProtKB/TrEMBL";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "SUBTILIS")) {
        db = "SubtiList";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "LocusID")) {
        db = "GeneID";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "MaizeDB")) {
        db = "MaizeGDB";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "GeneW")) {
        db = "HGNC";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "MGD")) {
        db = "MGI";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "IFO")) {
        db = "NBRC";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase(db, "BHB") ||
        NStr::EqualNocase(db, "BioHealthBase")) {
        db = "IRD";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::Equal(db, "GENEDB")) {
        db = "GeneDB";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::Equal(db, "cdd")) {
        db = "CDD";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::Equal(db, "FlyBase")) {
        db = "FLYBASE";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::Equal(db, "GreengenesID")) {
        db = "Greengenes";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::Equal(db, "HMPID")) {
        db = "HMP";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::Equal(db, "ATCC (inhost)")) {
        db = "ATCC(in host)";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::Equal(db, "ATCC (dna)")) {
        db = "ATCC(dna)";
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    }

    CObject_id& oid = GET_MUTABLE (dbtag, Tag);

    // Integer tags of HGNC / VGNC / MGI become "<DB>:<number>" strings.
    // The prefix is always the canonical upper-case name: the prefix checks
    // further down are case-sensitive, so reusing a differently-cased db
    // name here (e.g. "hgnc") would end up as "HGNC:hgnc:<number>".
    if (FIELD_IS (oid, Id)) {
        const char* canonical_prefix = nullptr;
        if (NStr::EqualNocase (db, "HGNC")) {
            canonical_prefix = "HGNC:";
        } else if (NStr::EqualNocase (db, "VGNC")) {
            canonical_prefix = "VGNC:";
        } else if (NStr::EqualNocase (db, "MGI")) {
            canonical_prefix = "MGI:";
        }
        if (canonical_prefix != nullptr) {
            const int val = dbtag.GetTag().GetId();
            const string id_as_str = canonical_prefix + NStr::IntToString(val);
            dbtag.SetTag().SetStr(id_as_str);
            ChangeMade(CCleanupChange::eChangeDbxrefs);
        }
    }

    if (! FIELD_IS (oid, Str)) return;

    // NB: str aliases the tag's own string, so it reflects every
    // dbtag.SetTag().SetStr() made below.
    string& str = GET_MUTABLE(oid, Str);
    if (NStr::IsBlank (str)) return;

    if (NStr::EqualNocase(db, "HPRD") && NStr::StartsWith (str, "HPRD_")) {
        dbtag.SetTag().SetStr (str.substr (5));
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    } else if (NStr::EqualNocase (db, "MGI") ) {
        if (!NStr::Equal(db, "MGI")) {
            dbtag.SetDb("MGI");
            ChangeMade(CCleanupChange::eChangeDbxrefs);
        }
        if(NStr::StartsWith (str, "MGI:")) {
            // already carries the expected prefix; stripping it is disabled:
            /*
            dbtag.SetTag().SetStr (dbtag.GetTag().GetStr().substr (4));
            ChangeMade(CCleanupChange::eChangeDbxrefs);
            */
        }
        else if (NStr::StartsWith(str, "MGD:")) {
            dbtag.SetTag().SetStr("MGI:" + dbtag.GetTag().GetStr().substr(4));
            ChangeMade(CCleanupChange::eChangeDbxrefs);
        } else if (NStr::StartsWith(str, "J:")) {
            if (s_IsAllDigits(str.substr(2))) {
                // NOTE(review): this replaces "J:<digits>" by the bare
                // prefix "MGI:" and so discards the number - confirm that
                // this is the intended result.
                dbtag.SetTag().SetStr("MGI:");
                // the tag was rewritten, so it has to be reported like
                // every other modification in this function
                ChangeMade(CCleanupChange::eChangeDbxrefs);
            }
        } else {
            string newstr = "MGI:" + str;
            dbtag.SetTag().SetStr(newstr);
            ChangeMade(CCleanupChange::eChangeDbxrefs);
        }
    } else if (NStr::EqualNocase (db, "HGNC") ) {
        if(! NStr::StartsWith (str, "HGNC:")) {
            string newstr = "HGNC:" + str;
            dbtag.SetTag().SetStr(newstr);
            ChangeMade(CCleanupChange::eChangeDbxrefs);
            /*
            dbtag.SetTag().SetStr (dbtag.GetTag().GetStr().substr (5));
            ChangeMade(CCleanupChange::eChangeDbxrefs);
            */
        }
    } else if (NStr::EqualNocase (db, "VGNC") ) {
        if(! NStr::StartsWith (str, "VGNC:")) {
            string newstr = "VGNC:" + str;
            dbtag.SetTag().SetStr(newstr);
            ChangeMade(CCleanupChange::eChangeDbxrefs);
            /*
            dbtag.SetTag().SetStr (dbtag.GetTag().GetStr().substr (5));
            ChangeMade(CCleanupChange::eChangeDbxrefs);
            */
        }
    } else if (NStr::EqualNocase (db, "RGD") ) {
        if(NStr::StartsWith (str, "RGD:")) {
            dbtag.SetTag().SetStr (str.substr (4));
            ChangeMade(CCleanupChange::eChangeDbxrefs);
        }
    }

    /*
    // convert to number if all digits
    if (s_IsAllDigits(str) && !NStr::StartsWith(str, "0")) {
        try {
            // extract the part before the first space for conversion
            string::size_type pos_of_first_space = 0;
            while (pos_of_first_space < str.length() && !isspace(str[pos_of_first_space])) {
                ++pos_of_first_space;
            }
            CTempString sStrOfNum(str, 0, pos_of_first_space);

            // only convert str to int if it fits into the non-negative side
            // of an int.
            int value = NStr::StringToInt(sStrOfNum, NStr::fConvErr_NoThrow);
            if (value > 0) {
                dbtag.SetTag().SetId(NStr::StringToUInt(sStrOfNum));
                ChangeMade(CCleanupChange::eChangeDbxrefs);
            }
        } catch (CStringException&) {
            // just leave things as are
        }
    }
    */
}


// Basic cleanup of a Pubdesc: delegates to CCleanupPub::CleanPubdesc
// (reporting eChangePublication if it returns true) and then records the
// current content label of every contained Pub in m_PubToNewPubLabelMap.
void CNewCleanup_imp::PubdescBC (
    CPubdesc& pubdesc
)
{
    const bool pub_was_cleaned = CCleanupPub::CleanPubdesc(pubdesc, m_StripSerial);
    if (pub_was_cleaned) {
        ChangeMade(CCleanupChange::eChangePublication);
    }

    // need to construct m_PubToNewPubLabelMap separately
    if (! pubdesc.IsSetPub()) {
        return;
    }
    for (auto& pub_ref : pubdesc.SetPub().Set()) {
        string label;
        pub_ref->GetLabel(&label, CPub::eContent, true);
        m_PubToNewPubLabelMap[pub_ref] = label;
    }
}


// A publication paired with the label generated for it.
typedef pair<string, CRef<CPub> >   TCit;

// Ordering predicate for TCit, intended for use as the comparator of an
// ordered container.  Entries are ordered by:
//   1. label, compared case-insensitively via s_CompareNoCaseCStyle;
//   2. label, compared case-sensitively;
//   3. Cit-gen title (see CitGenTitlesLess).
struct TSortCit {
    bool operator ()(const TCit& c1, const TCit& c2) const {

        // First, try to compare case-insensitively
        // (We compare as if it were all-caps to match C's behavior )
        const int label_compare_no_case =  s_CompareNoCaseCStyle(c1.first, c2.first);
        if( label_compare_no_case != 0 ) {
            return (label_compare_no_case < 0);
        }

        // if they're the same, try to compare case-sensitively
        const int label_compare_case = NStr::CompareCase( c1.first, c2.first );
        if( label_compare_case != 0 ) {
            return (label_compare_case < 0);
        }

        // if they're still the same, fall back on cit-gen titles, if possible
        return CitGenTitlesLess(*c1.second, *c2.second);
    }

    // Tie-breaker on Cit-gen titles.  Returns false (no ordering) unless
    // both pubs are Cit-gen.  A Cit-gen without a title sorts before one
    // with a title; two titled ones are ordered by their titles.
    bool CitGenTitlesLess(const CPub& p1, const CPub& p2) const {
        if ( ! p1.IsGen()  || ! p2.IsGen() ) {
            return false;
        }
        const CCit_gen& g1 = p1.GetGen();
        const CCit_gen& g2 = p2.GetGen();

        const bool has_title1 = g1.IsSetTitle();
        const bool has_title2 = g2.IsSetTitle();
        if ( has_title1 != has_title2 ) {
            // Exactly one side has a title.  The answer must depend on
            // WHICH side it is: returning true for both (a,b) and (b,a)
            // would violate the strict weak ordering that ordered
            // containers require of their comparator.
            return has_title2;
        }
        if ( ! has_title1 ) {
            // neither has a title: equivalent
            return false;
        }
        return g1.GetTitle() < g2.GetTitle();
    }
};

static
bool cmpSortedvsOld(const TCit& e1, const CRef<CPub>& e2) {
    return e1.second == e2;
}

// Sorts and de-duplicates the pubs of a Pub-set of the "pub" variety; any
// other variety is left alone.  The list is only rewritten, and
// eCleanCitonFeat only reported, when sorting/uniquing actually changes it.
void CNewCleanup_imp::PubSetBC( CPub_set &pub_set )
{
    // The Pub-set should always be pub. Ignore if not.
    if( ! pub_set.IsPub() ) {
        return;
    }

    // Sorting and uniquing both fall out of inserting every pub into a set
    // keyed on (label, pub) and ordered by TSortCit.
    typedef set<TCit, TSortCit> TCitSet;
    TCitSet sorted_pubs;
    for (const auto& pub_ref : pub_set.GetPub()) {
        string pub_label;
        pub_ref->GetLabel(&pub_label, CPub::eContent, CPub::fLabel_Unique, CPub::eLabel_V1 );
        // insertion is a no-op for entries that compare equivalent to one
        // already present, which is how duplicates get dropped
        sorted_pubs.insert( TCit(pub_label, pub_ref) );
    }

    auto& pubs = pub_set.SetPub();

    // Same size and same elements in the same order => nothing to do.
    const bool is_unchanged =
        sorted_pubs.size() == pubs.size() &&
        equal(sorted_pubs.begin(), sorted_pubs.end(), pubs.begin(), cmpSortedvsOld);
    if (is_unchanged) {
        return;
    }

    // rebuild the list from the sorted, de-duplicated set
    pubs.clear();
    for (const TCit& entry : sorted_pubs) {
        pubs.push_back(entry.second);
    }
    ChangeMade(CCleanupChange::eCleanCitonFeat);
}


// Basic cleanup of an import feature (Seq-feat whose data is an Imp-feat).
//
// Does nothing for any other kind of feature.  Otherwise:
//   - cleans the Key, Loc and Descr strings and lets
//     CSeqFeatData::FixImportKey adjust the key;
//   - rewrites the keys "satellite" (not for EMBL/DDBJ) and "LTR" to
//     "repeat_region" plus a qualifier, and regulatory subtypes to
//     "regulatory" plus a regulatory_class qualifier;
//   - turns a "CDS" import feature into a Cdregion feature (not for
//     EMBL/DDBJ) and returns;
//   - if Loc is set and contains "replace", moves it into a qualifier via
//     x_AddReplaceQual;
//   - if Loc is NOT set, converts RNA keys to an RNA-ref feature, and
//     protein-processing keys (on a protein Bioseq) to a Prot-ref feature.
void CNewCleanup_imp::ImpFeatBC( CSeq_feat& feat )
{
    if( ! FIELD_IS_SET_AND_IS(feat, Data, Imp) ) {
        return;
    }

    CImp_feat &imf = GET_MUTABLE( feat.SetData(), Imp );

    CLEAN_STRING_MEMBER_JUNK(imf, Key);
    CLEAN_STRING_MEMBER(imf, Loc);
    CLEAN_STRING_MEMBER(imf, Descr);

    if (imf.IsSetKey() && CSeqFeatData::FixImportKey(imf.SetKey())) {
        ChangeMade(CCleanupChange::eChangeKeywords);
    }
    
    if ( FIELD_IS_SET(imf, Key) ) {
        // NOTE(review): key is bound by reference; assuming GET_FIELD yields
        // a reference to imf's own Key, the SET_FIELD(imf, Key, ...) calls
        // below also change what the later `key == ...` tests see - confirm
        // before reordering anything in this block.
        const CImp_feat::TKey& key = GET_FIELD(imf, Key);
        if ( key == "satellite" && ! m_IsEmblOrDdbj ) {
            // "satellite" -> "repeat_region" + /satellite qualifier
            SET_FIELD(imf, Key, "repeat_region");
            ChangeMade(CCleanupChange::eChangeKeywords);

            CRef<CGb_qual> satellite_qual( new CGb_qual );
            satellite_qual->SetQual("satellite");
            string val;
            if( FIELD_IS_SET(feat, Comment) ) {
                // the comment is passed mutably, so the helper may edit it
                val = x_ExtractSatelliteFromComment( GET_MUTABLE(feat, Comment) );
            }
            if( val.empty() ) {
                // nothing usable in the comment: fall back to a generic value
                val = "satellite";
            }
            satellite_qual->SetVal( val );

            feat.SetQual().push_back( satellite_qual );
        } else if ( key == "LTR" ) {
            // "LTR" -> "repeat_region" + /rpt_type=long_terminal_repeat
            SET_FIELD(imf, Key, "repeat_region");
            ChangeMade(CCleanupChange::eChangeKeywords);

            CRef<CGb_qual> rpt_type_qual( new CGb_qual );
            rpt_type_qual->SetQual( "rpt_type" );
            rpt_type_qual->SetVal( "long_terminal_repeat" );

            feat.SetQual().push_back( rpt_type_qual );
        }

        // regulatory subtypes -> "regulatory" + /regulatory_class
        CSeqFeatData::ESubtype subtype = feat.GetData().GetSubtype();
        if (CSeqFeatData::IsRegulatory(subtype)) {
            string regulatory_class = CSeqFeatData::GetRegulatoryClass(subtype);
            SET_FIELD(imf, Key, "regulatory");
            ChangeMade(CCleanupChange::eChangeKeywords);
            CRef<CGb_qual> regulatory_class_qual( new CGb_qual );
            regulatory_class_qual->SetQual("regulatory_class");
            if (NStr::IsBlank(regulatory_class)) {
                regulatory_class_qual->SetVal( "other" );
            } else {
                regulatory_class_qual->SetVal( regulatory_class );
            }
            feat.SetQual().push_back( regulatory_class_qual );
        }

        // repeat_region: add a /satellite qualifier if the comment yields one
        if( key == "repeat_region" && ! m_IsEmblOrDdbj ) {
            string val;
            if( FIELD_IS_SET(feat, Comment) ) {
                val = x_ExtractSatelliteFromComment( GET_MUTABLE(feat, Comment) );
            }
            if( ! val.empty() ) {
                CRef<CGb_qual> satellite_qual( new CGb_qual );
                satellite_qual->SetQual("satellite");
                satellite_qual->SetVal( val );

                feat.SetQual().push_back( satellite_qual );
                ChangeMade(CCleanupChange::eChangeKeywords);
            }
        }

        // "CDS" import feature -> real Cdregion feature
        if( key == "CDS" ) {
            if( ! m_IsEmblOrDdbj ) {
                CRef<CCdregion> new_cdregion( new CCdregion );
                // get frame from location
                if( ! FIELD_EQUALS( feat, Pseudo, true ) &&
                    feat.IsSetLocation() &&
                    CCleanup::SetFrameFromLoc(*new_cdregion, feat.GetLocation(), *m_Scope)) {
                    ChangeMade(CCleanupChange::eChangeCdregion);
                }
                ChangeMade(CCleanupChange::eChangeKeywords);

                CdregionFeatBC( *new_cdregion, feat );
                feat.SetData().SetCdregion(*new_cdregion);
                // the data is no longer an Imp-feat, so imf and key must not
                // be used past this point
                return;
            }
        }
    }

    if( FIELD_IS_SET(imf, Loc) ) {
        // a Loc mentioning "replace" is moved into a qualifier
        if ( NStr::Find(imf.GetLoc(), "replace") != NPOS ) {
            x_AddReplaceQual(feat, imf.GetLoc());
            RESET_FIELD(imf, Loc);
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    } else if( FIELD_IS_SET(imf, Key) ) {
        // Only reached when Loc is not set: convert by key to RNA / protein.
        const string &key = GET_FIELD(imf, Key);

        // map the import key onto an RNA type, if it names one
        TRNAREF_TYPE rna_ref_type = NCBI_RNAREF(unknown);
        if ( key == "precursor_RNA" ) {
            rna_ref_type = NCBI_RNAREF(premsg);
        } else if ( key == "mRNA" ) {
            rna_ref_type = NCBI_RNAREF(mRNA);
        } else if ( key == "tRNA" ) {
            rna_ref_type = NCBI_RNAREF(tRNA);
        } else if ( key == "rRNA" ) {
            rna_ref_type = NCBI_RNAREF(rRNA);
        } else if ( key == "snRNA" ) {
            rna_ref_type = NCBI_RNAREF(snRNA);
        } else if ( key == "scRNA" ) {
            rna_ref_type = NCBI_RNAREF(scRNA);
        } else if ( key == "snoRNA" ) {
            rna_ref_type = NCBI_RNAREF(snoRNA);
        } else if ( key == "misc_RNA" ) {
            rna_ref_type = NCBI_RNAREF(other);
        }
        if (rna_ref_type != NCBI_RNAREF(unknown) ) {
            // replace the Imp-feat with an RNA-ref and clean the result;
            // key is not used again after the data has been replaced
            CRef<CRNA_ref> new_rna_ref( new CRNA_ref );
            new_rna_ref->SetType( rna_ref_type );
            feat.SetData().SetRna( *new_rna_ref );
            ChangeMade(CCleanupChange::eAddRNAref);
            x_CleanSeqFeatQuals(feat);
            RnaFeatBC(feat.SetData().SetRna(), feat);
        } else {
            // not an RNA key: see whether it names a protein processing state
            TPROTREF_PROCESSED processed = NCBI_PROTREF(not_set);
            if ( key == "proprotein" ||  key == "preprotein" ) {
                processed = NCBI_PROTREF(preprotein);
            } else if ( key == "mat_peptide" ) {
                processed = NCBI_PROTREF(mature);
            } else if ( key == "sig_peptide" ) {
                processed = NCBI_PROTREF(signal_peptide);
            } else if ( key == "transit_peptide" ) {
                processed = NCBI_PROTREF(transit_peptide);
            } else if ( key == "propeptide" ) {
                processed = NCBI_PROTREF(propeptide);
            }
            if (processed != NCBI_PROTREF(not_set) || key == "Protein" ) {
                // the conversion is only done when the feature's location
                // resolves to a single Seq-id whose Bioseq is a protein
                const CSeq_id* location_seq_id = ( feat.IsSetLocation() ? feat.GetLocation().GetId() : NULL );
                if( location_seq_id ) {
                    CBioseq_Handle bioseq_handle = m_Scope->GetBioseqHandle(*location_seq_id);
                    if ( bioseq_handle && bioseq_handle.IsAa() ) {
                        CRef<CProt_ref> new_prot_ref( new CProt_ref );
                        new_prot_ref->SetProcessed( processed );
                        // a non-blank comment becomes the protein name
                        if (feat.IsSetComment() && !NStr::IsBlank(feat.GetComment())) {
                            new_prot_ref->SetName().push_back(feat.GetComment());
                            feat.ResetComment();
                        }
                        feat.SetData().SetProt( *new_prot_ref );
                        ChangeMade(CCleanupChange::eAddProtFeat);
                        x_CleanSeqFeatQuals(feat);
                    }
                }
            }
        }
    }
}


// Table mapping site descriptions to CSeqFeatData site values.  It backs
// sc_SiteMap, a case-insensitive (PNocase) static map which SiteFeatBC
// searches with s_FindInMapAsPrefix to recognize a site type at the start of
// a feature comment.  Multi-word names are listed both space- and
// hyphen-separated.
// NOTE(review): static array maps presumably require the entries to stay in
// sorted key order - confirm before adding or reordering entries.
typedef SStaticPair<const char*, CSeqFeatData::TSite>  TSiteElem;
static const TSiteElem sc_site_map[] = {
    { "acetylation", CSeqFeatData::eSite_acetylation },
    { "active", CSeqFeatData::eSite_active },
    { "amidation", CSeqFeatData::eSite_amidation },
    { "binding", CSeqFeatData::eSite_binding },
    { "blocked", CSeqFeatData::eSite_blocked },
    { "cleavage", CSeqFeatData::eSite_cleavage },
    { "dna binding", CSeqFeatData::eSite_dna_binding },
    { "dna-binding", CSeqFeatData::eSite_dna_binding },
    { "gamma carboxyglutamic acid", CSeqFeatData::eSite_gamma_carboxyglutamic_acid },
    { "gamma-carboxyglutamic-acid", CSeqFeatData::eSite_gamma_carboxyglutamic_acid },
    { "glycosylation", CSeqFeatData::eSite_glycosylation },
    { "hydroxylation", CSeqFeatData::eSite_hydroxylation },
    { "inhibit", CSeqFeatData::eSite_inhibit },
    { "lipid binding", CSeqFeatData::eSite_lipid_binding },
    { "lipid-binding", CSeqFeatData::eSite_lipid_binding },
    { "metal binding", CSeqFeatData::eSite_metal_binding },
    { "metal-binding", CSeqFeatData::eSite_metal_binding },
    { "methylation", CSeqFeatData::eSite_methylation },
    // deliberately a stem rather than a full word
    { "modifi", CSeqFeatData::eSite_modified },
    { "mutagenized", CSeqFeatData::eSite_mutagenized },
    { "myristoylation", CSeqFeatData::eSite_myristoylation },
    { "nitrosylation", CSeqFeatData::eSite_nitrosylation },
    { "np binding", CSeqFeatData::eSite_np_binding },
    { "np-binding", CSeqFeatData::eSite_np_binding },
    { "oxidative deamination", CSeqFeatData::eSite_oxidative_deamination },
    { "oxidative-deamination", CSeqFeatData::eSite_oxidative_deamination },
    { "phosphorylation", CSeqFeatData::eSite_phosphorylation },
    { "pyrrolidone carboxylic acid", CSeqFeatData::eSite_pyrrolidone_carboxylic_acid },
    { "pyrrolidone-carboxylic-acid", CSeqFeatData::eSite_pyrrolidone_carboxylic_acid },
    { "signal peptide", CSeqFeatData::eSite_signal_peptide },
    { "signal-peptide", CSeqFeatData::eSite_signal_peptide },
    { "sulfatation", CSeqFeatData::eSite_sulfatation },
    { "transit peptide", CSeqFeatData::eSite_transit_peptide },
    { "transit-peptide", CSeqFeatData::eSite_transit_peptide },
    { "transmembrane region", CSeqFeatData::eSite_transmembrane_region },
    { "transmembrane-region", CSeqFeatData::eSite_transmembrane_region }
};
typedef CStaticArrayMap<string, CSeqFeatData::TSite, PNocase> TSiteMap;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TSiteMap, sc_SiteMap, sc_site_map);

// Basic cleanup of a site feature.
//
// When the site is unset (0) or "other" and the feature has a comment that
// starts with one of the names in sc_SiteMap, the feature's site is set to
// the matching value.  The comment is then dropped if nothing but blanks, or
// just " site", follows the matched name.
void CNewCleanup_imp::SiteFeatBC( const CSeqFeatData::ESite &site, CSeq_feat& feat )
{
    if ( ! FIELD_IS_SET(feat, Comment) ) {
        return;
    }

    // only an uninformative site value is worth replacing
    const bool site_is_uninformative =
        (site == CSeqFeatData::TSite(0)  ||  site == CSeqFeatData::eSite_other);
    if ( ! site_is_uninformative ) {
        return;
    }

    // look for a known site name at the start of the comment
    const string& comment = GET_FIELD(feat, Comment);
    TSiteMap::const_iterator match = s_FindInMapAsPrefix<TSiteMap>( comment, sc_SiteMap );
    if ( match == sc_SiteMap.end() ) {
        return;
    }

    feat.SetData().SetSite(match->second);
    ChangeMade(CCleanupChange::eChangeSite);

    // erase the comment if it contains no further useful info aside from the site
    const string::size_type rest_pos = match->first.length();
    const bool nothing_else_useful =
        NStr::IsBlank(comment, rest_pos)  ||
        NStr::EqualNocase(comment, rest_pos, NPOS, " site");
    if (nothing_else_useful) {
        feat.ResetComment();
        ChangeMade(CCleanupChange::eChangeComment);
    }
}

// Basic cleanup of a Seq-loc, by variety:
//   - int:        cleaned via x_SeqIntervalBC;
//   - packed-int: every interval is cleaned; a single-interval packed-int
//                 is collapsed to a plain int;
//   - pnt:        an explicitly "unknown" strand is reset, and a "tl" limit
//                 fuzz becomes "tr" on the preceding position;
//   - mix:        leading and trailing NULLs are removed; an empty mix
//                 becomes NULL and a single-element mix is replaced by that
//                 element.
// Finally, strand is removed from the location if the first Bioseq that can
// be resolved from it is a protein.
void CNewCleanup_imp::SeqLocBC( CSeq_loc &loc )
{
    switch (loc.Which()) {
    case CSeq_loc::e_Int :
        x_SeqIntervalBC( GET_MUTABLE(loc, Int) );
        break;

    case CSeq_loc::e_Packed_int :
        {
            CSeq_loc::TPacked_int::Tdata& intervals = loc.SetPacked_int().Set();
            NON_CONST_ITERATE(CSeq_loc::TPacked_int::Tdata, ival_it, intervals) {
                x_SeqIntervalBC(**ival_it);
            }
            if (intervals.size() == 1) {
                // hold our own reference: SetInt() replaces the packed-int
                CRef<CSeq_interval> sole_interval = intervals.front();
                loc.SetInt(*sole_interval);
                ChangeMade(CCleanupChange::eChangeSeqloc);
            }
        }
        break;

    case CSeq_loc::e_Pnt :
        {
            CSeq_loc::TPnt& point = loc.SetPnt();

            // an "unknown" strand carries no information: reset it
            if (point.CanGetStrand()) {
                const ENa_strand point_strand = point.GetStrand();
                if( point_strand == eNa_strand_unknown ) {
                    point.ResetStrand();
                    ChangeMade(CCleanupChange::eChangeStrand);
                }
            }

            // normalize Seq-point fuzz tl to tr and decrement position
            const bool has_tl_fuzz =
                point.IsSetFuzz() && point.GetFuzz().IsLim() &&
                point.GetFuzz().GetLim() == CInt_fuzz::eLim_tl;
            if (has_tl_fuzz) {
                const TSeqPos position = point.GetPoint();
                // position 0 has no predecessor, so it is left as is
                if (position > 0) {
                    point.SetFuzz().SetLim(CInt_fuzz::eLim_tr);
                    point.SetPoint(position - 1);
                    ChangeMade(CCleanupChange::eChangeSeqloc);
                }
            }
        }
        break;

    case CSeq_loc::e_Mix :
        {
            typedef CSeq_loc::TMix::Tdata TMixList;
            // delete Null type Seq-locs from beginning and end of Mix list.
            TMixList& pieces = loc.SetMix().Set();

            // leading NULLs, one at a time
            TMixList::iterator piece_it = pieces.begin();
            while (piece_it != pieces.end()  &&  (*piece_it)->IsNull()) {
                piece_it = pieces.erase(piece_it);
                ChangeMade(CCleanupChange::eChangeSeqloc);
            }

            // trailing NULLs, as one range
            if( ! pieces.empty() ) {
                // walk backwards to the last non-NULL piece ...
                piece_it = pieces.end();
                while (piece_it != pieces.begin()) {
                    --piece_it;
                    if ( ! (*piece_it)->IsNull()) {
                        break;
                    }
                }
                // ... and erase everything after it
                ++piece_it;
                if (piece_it != pieces.end()) {
                    pieces.erase(piece_it, pieces.end());
                    ChangeMade(CCleanupChange::eChangeSeqloc);            
                }
            }

            // simplify degenerate mixes
            switch (pieces.size()) {
            case 0:
                loc.SetNull();
                ChangeMade(CCleanupChange::eChangeSeqloc);
                break;
            case 1:
                {
                    // hold our own reference while loc is being overwritten
                    CRef<CSeq_loc> only_piece = pieces.front();
                    loc.Assign(*only_piece);
                    ChangeMade(CCleanupChange::eChangeSeqloc);
                }
                break;
            default:
                break;
            }
        }
        break;

    default:
        break;
    }

    // don't allow strandedness on protein sequences
    {
        // use the first Seq-id of the location that resolves in the scope
        CBioseq_Handle first_resolved;
        if (m_Scope) {
            ITERATE( CSeq_loc, loc_ci, loc ) {
                first_resolved = m_Scope->GetBioseqHandle(loc_ci.GetSeq_id());
                if( first_resolved ) {
                    break;
                }
            }
        }
        if ( first_resolved && first_resolved.IsProtein() && FIELD_IS_SET(loc, Strand) ) { 
            RESET_FIELD(loc, Strand);
            ChangeMade(CCleanupChange::eChangeStrand);
        }
    }

}

// Converts a Seq-loc of the "whole" variety into an interval covering the
// entire sequence, i.e. [0, length - 1] on the same Seq-id.
//
// The location is left untouched when it is not "whole", when there is no
// scope, when the Bioseq cannot be resolved in the scope, or when the
// Bioseq has length 0.
void CNewCleanup_imp::ConvertSeqLocWholeToInt( CSeq_loc &loc )
{
    if (loc.IsWhole()  &&  m_Scope) {

        // Copy the id first: loc.SetInt() below discards the "whole" variant
        // that currently owns it.
        CRef<CSeq_id> id(new CSeq_id());
        id->Assign(loc.GetWhole());

        CBioseq_Handle bsh = m_Scope->GetBioseqHandle(*id);
        if (bsh) {
            const TSeqPos bs_len = bsh.GetBioseqLength();
            // TSeqPos is unsigned: for an empty sequence "bs_len - 1" would
            // wrap around to a huge end position, so such a location is not
            // converted.
            if (bs_len > 0) {
                auto& interval = loc.SetInt();
                interval.SetId(*id);
                interval.SetFrom(0);
                interval.SetTo(bs_len - 1);
                ChangeMade(CCleanupChange::eChangeWholeLocation);
            }
        }
    }
}

// Flattens mix_pieces into new_mix_pieces.
//
// NULL pieces are skipped, nested mixes are expanded recursively, and every
// other piece is appended as is.  When any_nulls_seen is true, a fresh NULL
// piece is inserted before each appended piece except the very first one.
static void 
s_AddSeqLocMix( CSeq_loc_mix::Tdata & new_mix_pieces, 
               CSeq_loc_mix::Tdata & mix_pieces, 
               bool any_nulls_seen )
{
    NON_CONST_ITERATE( CSeq_loc_mix::Tdata, piece_it, mix_pieces ) {
        CRef<CSeq_loc> piece( *piece_it );

        if( piece->IsNull() ) {
            // existing NULLs are dropped; separators are re-created below
            continue;
        }

        if( piece->IsMix() ) {
            // expand the nested mix in place
            s_AddSeqLocMix( new_mix_pieces, piece->SetMix(), 
                any_nulls_seen );
            continue;
        }

        // ordinary piece: optionally preceded by a NULL separator
        const bool needs_separator = any_nulls_seen && ! new_mix_pieces.empty();
        if( needs_separator ) {
            CRef<CSeq_loc> separator( new CSeq_loc );
            separator->SetNull();
            new_mix_pieces.push_back( separator );
        }
        new_mix_pieces.push_back( piece );
    }
}

// Basic cleanup of a Seq-loc mix: flattens nested mixes and normalizes the
// placement of NULL pieces.
//
// The pieces are scanned once to determine
//   - whether any piece is itself a mix,
//   - whether any NULL occurs (at the top level or inside a nested mix),
//   - whether the top level already strictly alternates
//     notnull, null, notnull, ..., null, notnull.
// If a nested mix was found, or NULLs are present without that strict
// alternation, the piece list is rebuilt via s_AddSeqLocMix and the change
// is recorded as eChangeSeqloc.
void CNewCleanup_imp::SeqLocMixBC( CSeq_loc_mix & loc_mix )
{
    if( ! loc_mix.IsSet() || loc_mix.Set().empty() ) {
        return;
    }

    // This function does two things simultaneously:
    // It checks for mix-inside-mix and also checks if 
    // we need to do "NULL-normalization"
    bool have_seen_inner_mix = false;
    bool any_nulls_seen = false;
    bool alternates_not_null_then_null = true;

    CSeq_loc_mix::Tdata & mix_pieces = loc_mix.Set();
    if( (mix_pieces.size() % 2) == 0 ) {
        // can't do notnull-null-notnull-null-notnull-....-null-notnull
        // if we have an even number of items
        alternates_not_null_then_null = false;
    }

    // Starting as "true" forces the first piece to be not-null for the
    // alternation test to succeed.
    bool last_piece_was_null = true;
    ITERATE( CSeq_loc_mix::Tdata, outer_mix_iter, mix_pieces ) {
        const CSeq_loc &this_piece = **outer_mix_iter;
        const bool this_piece_is_null = this_piece.IsNull();

        // see if we've found any NULLs in this loc
        if( this_piece_is_null ) {
            any_nulls_seen = true;
        }

        // see if we break alternation of notnull and null
        if( alternates_not_null_then_null ) {
            if( this_piece_is_null == last_piece_was_null ) {
                // two of the same kind in a row: does not alternate
                alternates_not_null_then_null = false;
            }
        }

        // see if there's a nested mix in here
        if( this_piece.IsMix() ) {
            have_seen_inner_mix = true;
            alternates_not_null_then_null = false; // mix breaks alternation
            // We have to check if the inner-mix contains any NULLs
            if( ! any_nulls_seen ) {
                CSeq_loc_CI inner_ci( this_piece, CSeq_loc_CI::eEmpty_Allow );
                for( ; inner_ci; ++inner_ci ) {
                    if( inner_ci.IsEmpty() ) {
                        any_nulls_seen = true;
                        break; // one is enough; no need to scan the rest
                    }
                }
            }
        }

        // for next iteration
        last_piece_was_null = this_piece_is_null;
    }

    // we've examined the location, so if there are any problems, we have
    // to rebuild it.
    if( have_seen_inner_mix || 
        (any_nulls_seen && ! alternates_not_null_then_null) ) 
    {
        CSeq_loc_mix new_mix;
        CSeq_loc_mix::Tdata & new_mix_pieces = new_mix.Set();

        // has to be in a separate function because it's recursive
        s_AddSeqLocMix( new_mix_pieces, mix_pieces, any_nulls_seen );

        // swap is faster than assignment
        loc_mix.Set().swap( new_mix_pieces );

        // the location was rewritten, so the cleanup must report it
        ChangeMade(CCleanupChange::eChangeSeqloc);
    }
}

// Returns true when str contains nothing of substance: every character is
// either a single or double quote, or compares less than or equal to ' '.
// An empty string therefore also yields true.
static bool s_IsJustQuotes (const string& str)

{
    for (const char ch : str) {
        const bool is_quote = (ch == '"' || ch == '\'');
        if (ch > ' ' && ! is_quote) {
            return false;
        }
    }
    return true;
}

// Basic cleanup of a single GenBank qualifier (name/value pair).
//
// On return both Qual and Val are set (possibly to the empty string).
// The value text is tidied, and a handful of qualifier names/values are
// normalized (rpt_unit*, replace, repeat_type, regulatory_class,
// pseudogene, transposon / insertion_seq / mobile_element).
// Changes are recorded via ChangeMade() with the category shown at each
// step below.
//
// gbq  the qualifier to clean up; modified in place.
void CNewCleanup_imp::GBQualBC (
    CGb_qual& gbq
)

{
    // Clean the qualifier name, then make sure the field exists at all.
    // NOTE(review): CLEAN_STRING_MEMBER is a macro defined elsewhere; the
    // check that follows suggests it may leave Qual unset -- confirm.
    CLEAN_STRING_MEMBER (gbq, Qual);
    if (! FIELD_IS_SET (gbq, Qual)) {
        SET_FIELD (gbq, Qual, kEmptyStr);
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }

    // Tidy the value.  A change is detected purely by comparing the
    // length before and after the three clean-up passes.
    if (FIELD_IS_SET (gbq, Val)) {
        const string::size_type old_length = gbq.GetVal().length();
        CleanVisString (gbq.SetVal());
        TrimInternalSemicolons (gbq.SetVal());
        x_CompressSpaces( gbq.SetVal() );
        if (gbq.GetVal().length() != old_length) {
            ChangeMade (CCleanupChange::eTrimSpaces);
        }
    }
    // A value made up only of quotes / blanks carries no information.
    // Note that s_IsJustQuotes() is also true for an empty value.
    if (FIELD_IS_SET (gbq, Val) && s_IsJustQuotes (GET_FIELD (gbq, Val))) {
        SET_FIELD (gbq, Val, kEmptyStr);
        ChangeMade (CCleanupChange::eCleanDoubleQuotes);
    }
    // Make sure the value field exists as well.
    if (! FIELD_IS_SET (gbq, Val)) {
        SET_FIELD (gbq, Val, kEmptyStr);
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }

    // From here on both fields may be read without checking.
    _ASSERT (FIELD_IS_SET (gbq, Qual) && FIELD_IS_SET (gbq, Val));

    // Name-specific normalization.  Only the first matching branch runs,
    // even if that branch renames the qualifier.
    if (NStr::EqualNocase(gbq.GetQual(), "rpt_unit_seq")) {
        // a value that looks like a base range belongs in rpt_unit_range
        if (x_IsBaseRange(gbq.GetVal())) {
            gbq.SetQual("rpt_unit_range");
            CGb_qual::CleanupRptUnitRange(gbq.SetVal());
            ChangeMade(CCleanupChange::eChangeQualifiers);
        } else if (CGb_qual::CleanupRptUnitSeq(gbq.SetVal())) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
        // run unconditionally; its return value is not consulted here
        x_CleanupRptUnit(gbq);
    } else if (NStr::EqualNocase(gbq.GetQual(), "rpt_unit_range")) {
        // conversely, a non-range value belongs in rpt_unit_seq
        if (! x_IsBaseRange(gbq.GetVal())) {
            gbq.SetQual("rpt_unit_seq");
            CGb_qual::CleanupRptUnitSeq(gbq.SetVal());
            ChangeMade(CCleanupChange::eChangeQualifiers);
        } else if (CGb_qual::CleanupRptUnitRange(gbq.SetVal())) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    } else if (NStr::EqualNocase(gbq.GetQual(), "rpt_unit")) {
        // plain rpt_unit is re-labelled as rpt_unit_range or rpt_unit_seq
        if (x_CleanupRptUnit(gbq)) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    } else if (NStr::EqualNocase(gbq.GetQual(), "replace")) {
        if (CGb_qual::CleanupReplace(gbq.SetVal())) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    } else if (NStr::EqualNocase(gbq.GetQual(), "repeat_type")) {
        if (CGb_qual::FixRptTypeValue(gbq.SetVal())) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    } else if (NStr::EqualNocase(gbq.GetQual(), "regulatory_class")) {
        if (CSeqFeatData::FixRegulatoryClassValue(gbq.SetVal())) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    } else if (NStr::EqualNocase(gbq.GetQual(), "pseudogene")) {
        if (CGb_qual::FixPseudogeneValue(gbq.SetVal())) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    }


    // Obsolete "transposon" / "insertion_seq" become "mobile_element" ...
    x_ChangeTransposonToMobileElement(gbq);
    x_ChangeInsertionSeqToMobileElement(gbq);

    // ... which in turn is renamed to "mobile_element_type".
    if (NStr::EqualNocase(GET_FIELD(gbq, Qual), "mobile_element")) {
        SET_FIELD( gbq, Qual, "mobile_element_type" );
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }
    // finally normalize the mobile_element_type value itself
    if (NStr::EqualNocase(gbq.GetQual(), "mobile_element_type") &&
        gbq.IsSetVal() &&
        CGb_qual::FixMobileElementValue(gbq.SetVal())) {
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }
}

// Returns the key of an import feature if it is one of the recognized
// import-feature keys (exact, case-sensitive match), or "???" when the key
// is unset, empty or not recognized.
//
// When a key is recognized, the returned pointer refers to the key string
// stored inside imp, so it is only usable while that string is.
static 
const char *s_FindImpFeatType( const CImp_feat &imp )
{
    static const char *kFeatBad = "???";

    // The lookup below is a binary search:
    // keep sorted in ASCII-betical order
    static const char *kKnownImpKeys[] = {
        "-10_signal",      "-35_signal",
        "3'UTR",           "3'clip",          "5'UTR",           "5'clip",
        "CAAT_signal",     "CDS",             "C_region",
        "D-loop",          "D_segment",
        "GC_signal",       "Import",          "J_segment",       "LTR",
        "N_region",        "RBS",
        "STS",             "S_region",        "Site-ref",
        "TATA_signal",     "V_region",        "V_segment",
        "allele",          "attenuator",
        "centromere",      "conflict",
        "enhancer",        "exon",
        "gap",
        "iDNA",            "intron",
        "mat_peptide",     "misc_RNA",        "misc_binding",    "misc_difference",
        "misc_feature",    "misc_recomb",     "misc_signal",     "misc_structure",
        "mobile_element",  "modified_base",   "mutation",
        "old_sequence",    "operon",          "oriT",
        "polyA_signal",    "polyA_site",      "precursor_RNA",   "prim_transcript",
        "primer_bind",     "promoter",        "protein_bind",
        "regulatory",      "rep_origin",      "repeat_region",   "repeat_unit",
        "satellite",       "sig_peptide",     "source",          "stem_loop",
        "telomere",        "terminator",      "transit_peptide",
        "unsure",
        "variation",       "virion"
    };
    static const int kNumKnownImpKeys =
        ( sizeof(kKnownImpKeys) / sizeof(kKnownImpKeys[0]) );

    if( RAW_FIELD_IS_EMPTY_OR_UNSET(imp, Key) ) {
        return kFeatBad;
    }

    // the C logic is more complex than this
    const char *candidate = GET_FIELD(imp, Key).c_str();
    const bool is_known = binary_search(
        kKnownImpKeys, kKnownImpKeys + kNumKnownImpKeys,
        candidate, PCase_CStr() );

    return is_known ? candidate : kFeatBad;
}

static 
const char *s_FindKeyFromFeatDefType( const CSeq_feat &feat )
{
    static const char *kFeatBad = "???";
    const CSeqFeatData& fdata = feat.GetData();

    switch (fdata.Which()) {
        case NCBI_SEQFEAT(Gene):
            return "Gene";
        case NCBI_SEQFEAT(Org):
            return "Org";
        case NCBI_SEQFEAT(Cdregion):
            return "CDS";
        case NCBI_SEQFEAT(Prot):
            if(fdata.GetProt().IsSetProcessed() ) {
                switch( feat.GetData().GetProt().GetProcessed() ) {
                case NCBI_PROTREF(not_set):
                    return "Protein";
                case NCBI_PROTREF(preprotein):
                    return "proprotein";
                case NCBI_PROTREF(mature):
                    return "mat_peptide";
                case NCBI_PROTREF(signal_peptide):
                    return "sig_peptide";
                case NCBI_PROTREF(transit_peptide):
                    return "transit_peptide";
                case NCBI_PROTREF(propeptide):
                    return "propeptide";
                default:
                    return kFeatBad;
                }
            }
            return "Protein";
        case NCBI_SEQFEAT(Rna):
            if(fdata.GetRna().IsSetType() ) {
                const auto& rna = fdata.GetRna();
                switch (rna.GetType() )
                {
                case NCBI_RNAREF(unknown):
                        return "misc_RNA"; // unknownrna mapped to otherrna
                case NCBI_RNAREF(premsg):
                    return "precursor_RNA";
                case NCBI_RNAREF(mRNA):
                    return "mRNA";
                case NCBI_RNAREF(tRNA):
                    return "tRNA";
                case NCBI_RNAREF(rRNA):
                    return "rRNA";
                case NCBI_RNAREF(snRNA):
                    return "snRNA";
                case NCBI_RNAREF(scRNA):
                    return "scRNA";
                case NCBI_RNAREF(snoRNA):
                    return "snoRNA";
                case NCBI_RNAREF(ncRNA):
                    return "ncRNA";
                case NCBI_RNAREF(tmRNA):
                    return "tmRNA";
                case NCBI_RNAREF(miscRNA):
                    return "misc_RNA";
                case NCBI_RNAREF(other):
                    if ( FIELD_IS_SET_AND_IS(rna, Ext, Name) ) {
                        const string &name = rna.GetExt().GetName();
                        if ( NStr::EqualNocase(name, "misc_RNA")) return "misc_RNA";
                        if ( NStr::EqualNocase(name, "ncRNA") ) return "ncRNA";
                        if ( NStr::EqualNocase(name, "tmRNA") ) return "tmRNA";
                    }
                    return "misc_RNA";
                default:
                    return kFeatBad;
                }
            }
            return kFeatBad;
        case NCBI_SEQFEAT(Pub):
            return "Cit";
        case NCBI_SEQFEAT(Seq):
            return "Xref";
        case NCBI_SEQFEAT(Imp):
            return s_FindImpFeatType( fdata.GetImp() );
        case NCBI_SEQFEAT(Region):
            return "Region";
        case NCBI_SEQFEAT(Comment):
            return "Comment";
        case NCBI_SEQFEAT(Bond):
            return "Bond";
        case NCBI_SEQFEAT(Site):
            return "Site";
        case NCBI_SEQFEAT(Rsite):
            return "Rsite";
        case NCBI_SEQFEAT(User):
            return "User";
        case NCBI_SEQFEAT(Txinit):
            return "TxInit";
        case NCBI_SEQFEAT(Num):
            return "Num";
        case NCBI_SEQFEAT(Psec_str):
            return "SecStr";
        case NCBI_SEQFEAT(Non_std_residue):
            return "NonStdRes";
        case NCBI_SEQFEAT(Het):
            return "Het";
        case NCBI_SEQFEAT(Biosrc):
            return "Src";
        case NCBI_SEQFEAT(Clone):
            return "CloneRef";
        case NCBI_SEQFEAT(Variation):
            return "VariationRef";
        default:
            return kFeatBad;
    }
    return kFeatBad;
}


static bool SetExceptFromGbqual(const CGb_qual& gb_qual, CSeq_feat& feat)
{
    bool rval = false;
    if (!feat.IsSetExcept() || !feat.GetExcept()) {
        feat.SetExcept(true);
        rval = true;
    }

    if (!gb_qual.IsSetQual()) {
        return rval;
    }
    if (feat.IsSetExcept_text() && !NStr::IsBlank(feat.GetExcept_text())) {
        return rval;
    }
    // for whatever reason, C Toolkit only sets text if Gbqual was blank
    if (gb_qual.IsSetVal() && !NStr::IsBlank(gb_qual.GetVal())) {
        return rval;
    }
    string exc = gb_qual.GetQual();
    NStr::ReplaceInPlace (exc, "-", " ");
    NStr::ReplaceInPlace (exc, "_", " ");
    feat.SetExcept_text(exc);
    return true;
}


static bool s_StringsAreEquivalent(const string& str1, const string& str2)
{
    string s1 = NStr::Replace(str1, " ", "_");
    NStr::ReplaceInPlace(s1, "-", "_");
    string s2 = NStr::Replace(str2, " ", "_");
    NStr::ReplaceInPlace(s2, "-", "_");
    return NStr::EqualNocase(s1, s2);
}


// Basic cleanup of one GenBank qualifier in the context of the feature that
// carries it.
//
// Many legacy qualifiers are folded into proper feature fields (partial,
// pseudo, exception, comment, dbxref, gene xref, ...); in those cases the
// function returns eAction_Erase so that the caller can delete the
// qualifier.  Other qualifiers only have their value normalized in place.
//
// gb_qual  qualifier to examine; its name and value may be rewritten.
// feat     feature owning the qualifier; may be modified.
//
// Returns eAction_Erase if the qualifier should be removed from the
// feature, eAction_Nothing otherwise (including when the feature has no
// data at all).
CNewCleanup_imp::EAction CNewCleanup_imp::GBQualSeqFeatBC(CGb_qual& gb_qual, CSeq_feat& feat)
{
    if( ! FIELD_IS_SET(feat, Data) ) {
        return eAction_Nothing;
    }
    CSeqFeatData &data = GET_MUTABLE(feat, Data);

    // writable references: several branches below edit these in place
    string& qual = GET_MUTABLE(gb_qual, Qual);
    string& val  = GET_MUTABLE(gb_qual, Val);

    // an explicit "false" is dropped: the field is simply reset
    if( FIELD_EQUALS(feat, Pseudo, false) ) {
        RESET_FIELD(feat, Pseudo);
        ChangeMade (CCleanupChange::eChangeQualifiers);
    }

    if( FIELD_EQUALS(feat, Partial, false) ) {
        RESET_FIELD(feat, Partial);
        ChangeMade (CCleanupChange::eChangeQualifiers);
    }

    // Main dispatch on the qualifier name.  Only the first matching branch
    // runs; branches that do not return fall through to the checks after
    // the chain.
    if (NStr::EqualNocase(qual, "cons_splice")) {
        return eAction_Erase;
    } else if (s_StringsAreEquivalent(qual, "ribosomal-slippage") ||
               s_StringsAreEquivalent(qual, "trans-splicing") || 
               s_StringsAreEquivalent(qual, "artificial-location")) {
        // these are really exceptions; move them onto the feature
        if (SetExceptFromGbqual(gb_qual, feat)) {
            ChangeMade (CCleanupChange::eChangeException);
        }
        return eAction_Erase;
    } else if (NStr::EqualNocase(qual, "partial")) {
        feat.SetPartial(true);
        ChangeMade(CCleanupChange::eChangeQualifiers);
        return eAction_Erase;  // mark qual for deletion
    } else if (NStr::EqualNocase(qual, "evidence")) {
        return eAction_Erase;  // mark qual for deletion
    } else if (NStr::EqualNocase(qual, "exception")) {
        if( ! FIELD_EQUALS(feat, Except, true ) ) {
            SET_FIELD(feat, Except, true);
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
        // a meaningful value becomes the exception text, unless the
        // feature already has one
        if (!NStr::IsBlank(val)  &&  !NStr::EqualNocase(val, "true")) {
            if (!feat.IsSetExcept_text()) {
                feat.SetExcept_text(val);
                ChangeMade(CCleanupChange::eChangeQualifiers);
            }
        }
        return eAction_Erase;  // mark qual for deletion
    } else if (NStr::EqualNocase(qual, "experiment")) {
        // only the content-free boilerplate value is removed
        if (NStr::EqualNocase(val, "experimental evidence, no additional details recorded")) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
            return eAction_Erase;  // mark qual for deletion
        }
    } else if (NStr::EqualNocase(qual, "inference")) {
        if (NStr::EqualNocase(val, "non-experimental evidence, no additional details recorded")) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
            return eAction_Erase;  // mark qual for deletion
        } else {
            x_CleanupAndRepairInference(val);
        }
    } else if (NStr::EqualNocase(qual, "note")  ||
               NStr::EqualNocase(qual, "notes")  ||
               NStr::EqualNocase(qual, "comment")) {
        // the text moves into the feature comment ("; "-separated)
        if (!feat.IsSetComment()) {
            feat.SetComment(val);
        } else {
            (feat.SetComment() += "; ") += val;
        }
        ChangeMade(CCleanupChange::eChangeComment);
        ChangeMade(CCleanupChange::eChangeQualifiers);
        return eAction_Erase;  // mark qual for deletion
    } else if( NStr::EqualNocase(qual, "label") ) {
        if ( NStr::EqualNocase(val, s_FindKeyFromFeatDefType(feat)) ) {
            // skip label that is simply the feature key
        } else if ( ! FIELD_IS_SET(feat, Comment) || NStr::FindNoCase(GET_FIELD(feat, Comment), "label") == NPOS) {
            // if label is not already in comment, append
            if( GET_STRING_FLD_OR_BLANK(feat, Comment).empty() ) {
                SET_FIELD(feat, Comment, "label: " + val );
            } else {
                GET_MUTABLE(feat, Comment) += "; label: " + val;
            }
            ChangeMade(CCleanupChange::eChangeComment);
        }
        // the label qualifier is removed in every case
        return eAction_Erase;
    } else if (NStr::EqualNocase(qual, "regulatory_class")) {
        // "class:extra text" -> keep "class" as the value and move the
        // rest into the comment; values starting with "other:" are left
        // untouched
        string::size_type colon_pos = val.find_first_of(":");
        if (colon_pos != string::npos && ! NStr::StartsWith (val, "other:")) {
            string comment = val.substr( colon_pos + 1 );
            val.resize( colon_pos );
            if( GET_STRING_FLD_OR_BLANK(feat, Comment).empty() ) {
                SET_FIELD(feat, Comment, comment );
            } else {
                GET_MUTABLE(feat, Comment) += "; " + comment;
            }
            ChangeMade(CCleanupChange::eChangeComment);
        }
    } else if (NStr::EqualNocase(qual, "db_xref")) {
        // "db:tag" becomes a real Dbtag on the feature; a value that
        // cannot be split at ':' is left as a qualifier
        string tag, db;
        if (NStr::SplitInTwo(val, ":", db, tag)) {
            CRef<CDbtag> dbp(new CDbtag);
            dbp->SetDb(db);
            dbp->SetTag().SetStr(tag);
            
            feat.SetDbxref().push_back(dbp);
            ChangeMade(CCleanupChange::eChangeDbxrefs);
            return eAction_Erase;  // mark qual for deletion
        }
    } else if (NStr::EqualNocase(qual, "gdb_xref")) {
        CRef<CDbtag> dbp(new CDbtag);
        dbp->SetDb("GDB");
        dbp->SetTag().SetStr(val);
        feat.SetDbxref().push_back(dbp);
        ChangeMade(CCleanupChange::eChangeDbxrefs);
        return eAction_Erase;  // mark qual for deletion
    } else if ( NStr::EqualNocase(qual, "pseudo") ) {
        feat.SetPseudo(true);
        ChangeMade(CCleanupChange::eChangeQualifiers);
        return eAction_Erase;  // mark qual for deletion
    } else if ( NStr::EqualNocase(qual, "pseudogene") )
    {
        // the qualifier is kept, but the feature is also flagged pseudo
        if( ! FIELD_EQUALS(feat, Pseudo, true) ) {
            feat.SetPseudo(true);
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }

        // lowercase pseudogene qual
        string new_val = val;
        NStr::ToLower(new_val);
        if( new_val != val ) {
            val = new_val;
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    // The next four branches hand the qualifier to a handler specific to
    // the feature's data type.  A handler that does not ask for erasure
    // lets the chain continue with the branches that follow.
    } else if ( FIELD_IS(data, Gene)  &&  x_GeneGBQualBC( GET_MUTABLE(data, Gene), gb_qual) == eAction_Erase) {
        return eAction_Erase;  // mark qual for deletion
    } else if ( FIELD_IS(data, Cdregion)  &&  x_SeqFeatCDSGBQualBC(feat, GET_MUTABLE(data, Cdregion), gb_qual) == eAction_Erase ) {
        return eAction_Erase;  // mark qual for deletion
    } else if (data.IsRna()  &&  x_SeqFeatRnaGBQualBC(feat, data.SetRna(), gb_qual) == eAction_Erase) {
        return eAction_Erase;  // mark qual for deletion
    } else if (data.IsProt()  &&  x_ProtGBQualBC(data.SetProt(), gb_qual, eGBQualOpt_normal) == eAction_Erase) {
        return eAction_Erase;  // mark qual for deletion
    } else if (NStr::EqualNocase(qual, "gene")) {
        // a non-blank gene qualifier becomes a gene xref, inserted at the
        // front of the xref list
        if (!NStr::IsBlank(val)) {
            CRef<CSeqFeatXref> xref(new CSeqFeatXref);
            xref->SetData().SetGene().SetLocus(val);
            feat.SetXref().insert(feat.SetXref().begin(), xref);
            ChangeMade(CCleanupChange::eCopyGeneXref);
            return eAction_Erase;  // mark qual for deletion
        }
    } else if (NStr::EqualNocase(qual, "codon_start")) {
        if (!data.IsCdregion()) {
            // not legal on anything but CDS, so remove it
            return eAction_Erase;  // mark qual for deletion
        }
    } else if ( NStr::EqualNocase(qual, "EC_number") ) {
        x_CleanupECNumber(val);
    // NOTE(review): unlike the other branches this comparison is
    // case-sensitive -- confirm that this is intended.
    } else if( qual == "satellite" ) {
        x_MendSatelliteQualifier( val );
    } else if ( NStr::EqualNocase(qual, "replace") && data.GetSubtype() == CSeqFeatData::eSubtype_variation) {
        // replace values on variation features are lower-cased
        string orig = val;
        NStr::ToLower(val);
        if (!NStr::Equal(orig, val)) {
            ChangeMade(CCleanupChange::eCleanQualifiers);
        }
    }
    else if (NStr::EqualNocase(qual, "recombination_class")) {
        if (CGb_qual::FixRecombinationClassValue(val)) {
            ChangeMade(CCleanupChange::eCleanQualifiers);
        }
    }


    // The remaining checks run for every qualifier that was not erased
    // above.
    if( NStr::EqualNocase( qual, "mobile_element_type" ) ) {
        // trim spaces around first colon but only if there are no colons
        // with spaces before and after
        if( NPOS != NStr::Find(val, " :") || NPOS != NStr::Find(val, ": ") ) {
            if( s_RegexpReplace( val, "[ ]*:[ ]*", ":", 1 ) ) {
                ChangeMade(CCleanupChange::eCleanQualifiers);
            }
        }

        // a repeat_region import feature carrying a non-empty
        // mobile_element_type is converted to a mobile_element feature
        if( data.IsImp() && STRING_FIELD_MATCH( data.GetImp(), Key, "repeat_region" ) && ! val.empty() ) {
            qual = "mobile_element_type";
            data.SetImp().SetKey( "mobile_element" );
            ChangeMade(CCleanupChange::eCleanQualifiers);
        }
    }

    // estimated_length must be a number or "unknown"
    if( NStr::EqualNocase( qual, "estimated_length" ) ) {
        if( ! s_IsAllDigits(val) && ! NStr::EqualNocase(val, "unknown") ) {
            val = "unknown";
            ChangeMade(CCleanupChange::eCleanQualifiers);
        }
    }

    // conflict is obsolete.  Make it misc_difference, but add a note 
    // to the feature comment as to what it used to be.
    // (This test looks at the feature key only, not at the qualifier.)
    if( data.IsImp() && STRING_FIELD_MATCH( data.GetImp(), Key, "conflict" ) ) {
        data.SetImp().SetKey( "misc_difference");
        if( feat.IsSetComment() ) {
            GET_MUTABLE(feat, Comment) = "conflict; " + GET_FIELD(feat, Comment);
        } else {
            SET_FIELD(feat, Comment, "conflict");
        }
        ChangeMade(CCleanupChange::eCleanQualifiers);
    }

    // a qualifier with neither name nor value is useless
    if( qual.empty() && val.empty() ) {
        return eAction_Erase;
    }

    return eAction_Nothing;
}

// Returns true if val has the form "<start>..<stop>", where both parts
// (split at the first "..") convert to integers that are at least 1.
// Any exception thrown during conversion means "not a range".
bool CNewCleanup_imp::x_IsDotBaseRange(const string& val)
{
    const size_t dots_at = NStr::Find(val, "..");
    if (dots_at == string::npos) {
        return false;
    }

    try {
        const long first = NStr::StringToLong(val.substr(0, dots_at));
        const long last  = NStr::StringToLong(val.substr(dots_at + 2));
        return first >= 1 && last >= 1;
    } catch (...) {
        return false;
    }
}


// Returns true if val has the form "<start>-<stop>", where both parts
// (split at the first '-') convert to integers that are at least 1.
// Any exception thrown during conversion means "not a range".
bool CNewCleanup_imp::x_IsHyphenBaseRange(const string& val)
{
    const size_t hyphen_at = NStr::Find(val, "-");
    if (hyphen_at == string::npos) {
        return false;
    }

    try {
        const long first = NStr::StringToLong(val.substr(0, hyphen_at));
        const long last  = NStr::StringToLong(val.substr(hyphen_at + 1));
        return first >= 1 && last >= 1;
    } catch (...) {
        return false;
    }
}


// Returns true if val is a base range written either as "start..stop" or
// as "start-stop".  Values longer than 25 characters are rejected without
// further inspection.
bool CNewCleanup_imp::x_IsBaseRange(const string& val)
{
    if (val.length() > 25) {
        return false;
    }
    return x_IsDotBaseRange(val) || x_IsHyphenBaseRange(val);
}


// Re-labels a repeat-unit qualifier according to what its value looks like:
// a base range becomes "rpt_unit_range" (with a hyphen range rewritten to
// the ".." form), anything else becomes "rpt_unit_seq".
//
// Always returns true.
bool CNewCleanup_imp::x_CleanupRptUnit(CGb_qual& gbq)
{
    CGb_qual::CleanupRptUnitRange(gbq.SetVal());

    if ( ! x_IsBaseRange(gbq.GetVal()) ) {
        // not a range: treat the value as a sequence
        gbq.SetQual("rpt_unit_seq");
        CGb_qual::CleanupRptUnitSeq(gbq.SetVal());
        return true;
    }

    gbq.SetQual("rpt_unit_range");
    if (x_IsHyphenBaseRange(gbq.GetVal())) {
        NStr::ReplaceInPlace(gbq.SetVal(), "-", "..");
    }
    return true;
}

void CNewCleanup_imp::x_ChangeTransposonToMobileElement(CGb_qual& gbq)
//
//  As of Dec 2006, "transposon" is no longer legal as a qualifier. The replacement
//  qualifier is "mobile_element". In addition, the value has to be massaged to
//  indicate "integron" or "transposon".
//
//  Qualifiers with any other name are left untouched.
//
{
    if ( ! NStr::EqualNocase( GET_FIELD(gbq, Qual), "transposon") ) {
        return;
    }

    // values that denote an integron rather than a transposon
    // (matched exactly, including case)
    static const string kIntegronValues[] = {
        "class I integron",
        "class II integron",
        "class III integron",
        "class 1 integron",
        "class 2 integron",
        "class 3 integron"
    };
    static const string* const kIntegronValuesEnd
        = kIntegronValues + sizeof(kIntegronValues)/sizeof(*kIntegronValues);

    SET_FIELD( gbq, Qual, "mobile_element");

    const string* match =
        std::find(kIntegronValues, kIntegronValuesEnd, GET_FIELD(gbq, Val) );

    string new_value;
    if ( match == kIntegronValuesEnd ) {
        // ordinary transposon: just prefix the old value
        new_value = string("transposon: ") + GET_FIELD(gbq, Val);
    } else {
        // "class XXX integron" becomes "integron: class XXX"
        string::size_type cutoff = match->find( " integron" );
        _ASSERT( cutoff != string::npos ); // typo in kIntegronValues?
        new_value = string("integron: ") + match->substr(0, cutoff);
    }
    SET_FIELD( gbq, Val, new_value );

    ChangeMade(CCleanupChange::eChangeQualifiers);
}

void CNewCleanup_imp::x_ChangeInsertionSeqToMobileElement(CGb_qual& gbq)
//
//  As of Dec 2006, "insertion_seq" is no longer legal as a qualifier. The replacement
//  qualifier is "mobile_element". In addition, the value has to be massaged to
//  reflect the "insertion_seq".
//
//  Qualifiers with any other name are left untouched.
//
{
    if ( ! NStr::EqualNocase( GET_FIELD(gbq, Qual), "insertion_seq") ) {
        return;
    }

    // the old value is kept, prefixed with the element type
    const string prefixed_value =
        string("insertion sequence:") + GET_FIELD(gbq, Val);

    gbq.SetQual("mobile_element");
    gbq.SetVal( prefixed_value );
    ChangeMade(CCleanupChange::eChangeQualifiers);
}

static bool s_IsCompoundRptTypeValue( 
    const string& value )
//
//  Format of compound rpt_type values: (value[,value]*)
//
//  These are internal to sequin and are in theory cleaned up before the material
//  is released. However, some compound values have escaped into the wild and have 
//  not been retro-fixed yet (as of 2006-03-17).
//
//  Accepted are values of at least three characters that start with '(',
//  end with ')' and contain no other parenthesis in between.
//
{
    const string::size_type len = value.length();
    if (len < 3 || value.front() != '(' || value.back() != ')') {
        return false;
    }

    // no second opening parenthesis ...
    if (value.find('(', 1) != string::npos) {
        return false;
    }
    // ... and the only closing parenthesis is the final character
    return value.find(')') == len - 1;
}

static
void s_ExpandThisQual( 
    CSeq_feat::TQual& quals,        // the list of CGb_qual's (not used here).
    CSeq_feat::TQual::iterator& it, // points to the one qual we might expand.
    CSeq_feat::TQual& new_quals )    // new quals that will need to be inserted
//
//  Rules for "rpt_type" qualifiers (as of 2006-03-07):
//
//  There can be multiple occurrences of this qualifier, and we need to keep them 
//  all.
//  The value of this qualifier can also be a *list of values* which is *not* 
//  conforming to the ASN.1 and thus needs to be cleaned up. 
//
//  The cleanup entails turning the list of values into multiple occurrences of the 
//  given qualifier, each occurrence taking one of the values in the original 
//  list.
//
//  The first value of the list stays in the original qualifier; one new
//  qualifier per remaining value is appended to new_quals (the caller is
//  responsible for inserting them).  An empty list -- "()" or a list made
//  up of nothing but commas -- leaves the qualifier with an empty value.
//
{
    CGb_qual& qual = **it;
    const string qual_type = qual.GetQual();
    string& val = qual.SetVal();
    if (NStr::Equal(val, "()")) {
        val.clear();
        return;
    }
    if ( ! s_IsCompoundRptTypeValue( val ) ) {
        //
        //  nothing to do ...
        //
        return;
    }

    //
    //  Generate list of cleaned up values. Fix original qualifier and generate 
    //  list of new qualifiers to be added to the original list:
    //    
    vector< string > newValues;
    const string valueList = val.substr(1, val.length() - 2);
    NStr::Split(valueList, ",", newValues, NStr::fSplit_Tokenize);

    if ( newValues.empty() ) {
        // A list consisting only of delimiters, e.g. "(,)", may yield no
        // tokens at all.  Indexing newValues[0] would then be out of
        // bounds, so handle it like the empty list "()".
        val.clear();
        return;
    }
    
    qual.SetVal( newValues[0] );
   
    for ( size_t i=1; i < newValues.size(); ++i ) {
        CRef< CGb_qual > newQual( new CGb_qual() );
        newQual->SetQual( qual_type );
        newQual->SetVal( newValues[i] );
        new_quals.push_back( newQual ); 
    }
}


// Splits "combined" qualifier values such as "(a,b,c)" into one qualifier
// per value.
//
// For every qualifier a value wrapped in curly braces is first rewritten to
// use parentheses.  Qualifiers whose name matches (case-insensitively) one
// of the names in the table below are then handed to s_ExpandThisQual().
// Any qualifiers generated that way are appended to 'quals' once the scan is
// over, and GBQualBC() is run over the whole resulting list.
void CNewCleanup_imp::x_ExpandCombinedQuals(CSeq_feat::TQual& quals)
{
    // names of the qualifiers that may carry a parenthesized list
    static const char* const kExpandableQuals[] = {
        "rpt_type",
        "rpt_unit",
        "rpt_unit_range",
        "rpt_unit_seq",
        "usedin",
        "old_locus_tag",
        "compare",
        "replace"
    };
    const size_t kNumExpandableQuals =
        sizeof(kExpandableQuals) / sizeof(kExpandableQuals[0]);

    // collected separately so that 'quals' is not resized while iterated
    CSeq_feat::TQual expanded;

    NON_CONST_ITERATE (CSeq_feat::TQual, qual_it, quals) {
        CGb_qual& gbq = **qual_it;

        string& name  = GET_MUTABLE(gbq, Qual);
        string& value = GET_MUTABLE(gbq, Val);

        // convert curly braces to parens for some quals
        const size_t value_len = value.length();
        if (value_len > 1  &&  value[0] == '{'  &&  value[value_len - 1] == '}') {
            value[0] = '(';
            value[value_len - 1] = ')';
            ChangeMade(CCleanupChange::eCleanQualifiers);
        }

        for (size_t idx = 0; idx < kNumExpandableQuals; ++idx) {
            if (NStr::EqualNocase(name, kExpandableQuals[idx])) {
                s_ExpandThisQual( quals, qual_it, expanded );
                break;
            }
        }
    }

    if ( expanded.empty() ) {
        return;
    }

    quals.insert(quals.end(), expanded.begin(), expanded.end());
    ChangeMade(CCleanupChange::eChangeQualifiers);
    NON_CONST_ITERATE (CSeq_feat::TQual, qual_it, quals) {
        GBQualBC(**qual_it);
    }
}

// Moves the content of a GenBank qualifier into the matching Gene-ref field.
//
// Handled qualifier names (case-insensitive): "map", "allele", "locus_tag"
// and "gene_synonym".  A blank value is never applied.
//
// Returns eAction_Erase when the qualifier has been absorbed into 'gene'
// (or, for "allele", when it merely repeats the allele already stored);
// returns eAction_Nothing otherwise.
CNewCleanup_imp::EAction 
CNewCleanup_imp::x_GeneGBQualBC( CGene_ref& gene, const CGb_qual& gb_qual )
{
    const string& name  = GET_FIELD(gb_qual, Qual);
    const string& value = GET_FIELD(gb_qual, Val);

    if( NStr::IsBlank(value) ) {
        return eAction_Nothing;
    }

    bool applied = false;

    if (NStr::EqualNocase(name, "map")) {
        // only fills in a missing map location
        if ( ! gene.IsSetMaploc() ) {
            gene.SetMaploc(value);
            applied = true;
        }
    } else if (NStr::EqualNocase(name, "allele")) {
        if ( gene.IsSetAllele() ) {
            // an existing allele is never overwritten; a duplicate qualifier
            // is simply dropped
            if ( NStr::EqualNocase(value, gene.GetAllele()) ) {
                return eAction_Erase;
            }
            return eAction_Nothing;
        }
        gene.SetAllele(value);
        applied = true;
    } else if (NStr::EqualNocase(name, "locus_tag")) {
        // only fills in a missing locus tag
        if ( ! gene.IsSetLocus_tag() ) {
            gene.SetLocus_tag(value);
            applied = true;
        }
    } else if (NStr::EqualNocase(name, "gene_synonym")) {
        // synonyms accumulate
        gene.SetSyn().push_back(value);
        applied = true;
    }

    if ( ! applied ) {
        return eAction_Nothing;
    }

    ChangeMade(CCleanupChange::eChangeQualifiers);
    return eAction_Erase;
}

// Applies one GenBank qualifier found on a coding-region feature to the
// structured data it corresponds to.
//
//   feat    - the CDS feature that owns the qualifier
//   cds     - the Cdregion of that feature
//   gb_qual - the qualifier being examined
//
// Returns eAction_Erase when the caller should drop the qualifier and
// eAction_Nothing when it should be kept.
//
// Qualifiers handled here:
//   transl_except - always kept
//   codon_start   - may set Cdregion.frame
//   transl_table  - may set Cdregion.code
//   product, function, EC_number, prot_note
//                 - applied to the protein feature of the product sequence,
//                   or to a Prot-ref cross-reference on 'feat'
//   translation   - always erased
CNewCleanup_imp::EAction
CNewCleanup_imp::x_SeqFeatCDSGBQualBC(CSeq_feat& feat, CCdregion& cds, const CGb_qual& gb_qual)
{
    const string& qual = gb_qual.GetQual();
    const string& val  = gb_qual.GetVal();
    
    // transl_except qual -> Cdregion.code_break
    if (NStr::EqualNocase(qual, "transl_except")) {
        // could not be parsed earlier
        return eAction_Nothing;
    }

    // codon_start qual -> Cdregion.frame
    if (NStr::EqualNocase(qual, "codon_start")) {
        CCdregion::TFrame frame = GET_FIELD(cds, Frame);
        CCdregion::TFrame new_frame = CCdregion::TFrame(NStr::StringToNonNegativeInt(val));
        // only values that map onto frame one, two or three are acted upon;
        // anything else falls through and the qualifier ends up being kept
        if (new_frame == CCdregion::eFrame_one  ||
            new_frame == CCdregion::eFrame_two  ||
            new_frame == CCdregion::eFrame_three) {
            // the frame is overwritten only if it was not set before, or if
            // the feature is pseudo and has no product
            if (frame == CCdregion::eFrame_not_set  ||
                ( FIELD_EQUALS( feat, Pseudo, true ) && ! FIELD_IS_SET(feat, Product) )) {
                cds.SetFrame(new_frame);
                ChangeMade(CCleanupChange::eChangeQualifiers);
            }
            // a valid codon_start is erased whether or not it was applied
            return eAction_Erase;
        }
    }

    // transl_table qual -> Cdregion.code
    if (NStr::EqualNocase(qual, "transl_table")) {
        if ( FIELD_IS_SET(cds, Code) ) {
            // a genetic code is already present: the qualifier is erased only
            // if it repeats the first non-zero id (1 if there is none)
            const CCdregion::TCode& code = GET_FIELD(cds, Code);
            int transl_table = 1;
            ITERATE (CCdregion::TCode::Tdata, it, code.Get()) {
                if ( FIELD_IS(**it, Id)  &&  GET_FIELD(**it, Id) != 0) {
                    transl_table = GET_FIELD(**it, Id);
                    break;
                }
            }
            
            if (NStr::EqualNocase(NStr::UIntToString(transl_table), val)) {
                return eAction_Erase;
            }
        } else {
            // no genetic code yet: take it from the qualifier if the value
            // parses to a positive number
            int new_val = NStr::StringToNonNegativeInt(val);
            if (new_val > 0) {
                CRef<CGenetic_code::C_E> gc(new CGenetic_code::C_E);
                SET_FIELD(*gc, Id, new_val);
                cds.SetCode().Set().push_back(gc);
                
                // we don't have to check except-text because we're 
                // setting an unset genetic_code, not changing an existing one
                // (the except-text would be: "genetic code exception")
                ChangeMade(CCleanupChange::eChangeGeneticCode);
                return eAction_Erase;
            }
        }
    }

    // look for qualifiers that should be applied to protein feature
    // note - this should be moved to the "indexed" portion of basic cleanup,
    // because it needs to locate another sequence and feature
    //
    // Note that "product", "function" and "prot_note" are matched
    // case-sensitively here, while "EC_number" is matched case-insensitively.
    if (NStr::Equal(qual, "product") || NStr::Equal (qual, "function") || NStr::EqualNocase (qual, "EC_number")
        || NStr::Equal (qual, "prot_note"))  
    {
        // get protein sequence for product
        CRef<CSeq_feat> prot_feat;
        CRef<CProt_ref> prot_ref;

        // try to get existing prot_feat
        CBioseq_Handle prot_handle;
        if ( FIELD_IS_SET(feat, Product) ) {
            const CSeq_id *prod_seq_id = feat.GetProduct().GetId();
            if( prod_seq_id != NULL ) {
                prot_handle = m_Scope->GetBioseqHandle(*prod_seq_id);
            }
        }
        if (prot_handle) {
            // find main protein feature
            // (there is no break below, so when several features of subtype
            // "prot" exist the last one visited is the one that is kept)
            CConstRef<CBioseq> pseq = prot_handle.GetCompleteBioseq();
            if (pseq && pseq->IsSetAnnot()) {
                for (auto ait : pseq->GetAnnot()) {
                    if (ait->IsFtable()) {
                        for (auto fit : ait->GetData().GetFtable()) {
                            if (fit->IsSetData() && fit->GetData().GetSubtype() == CSeqFeatData::eSubtype_prot) {
                                // constness is cast away so that the protein
                                // feature can be edited in place
                                prot_feat.Reset(const_cast<CSeq_feat*>(fit.GetPointer()));
                                prot_ref.Reset(&(prot_feat->SetData().SetProt()));
                            }
                        }
                    }
                }
            }
        }

        bool push_back_xref_on_success = false;
        CRef<CSeqFeatXref> xref;
        if ( ! prot_ref ) {
            // otherwise make cross reference
            prot_ref.Reset( new CProt_ref );

            // see if this seq-feat already has a prot xref
            // (again the last matching one wins)
            EDIT_EACH_SEQFEATXREF_ON_SEQFEAT( xref_iter, feat ) {
                if( (*xref_iter)->IsSetData() && (*xref_iter)->GetData().IsProt() ) {
                    xref = *xref_iter;
                }
            }
            // seq-feat has no prot xref. We make our own.
            if ( ! xref ) {
                xref.Reset( new CSeqFeatXref );
                xref->SetData().SetProt( *prot_ref );
                // we will push the xref onto the feat if the add was successful
                push_back_xref_on_success = true;
            }
            // from here on prot_ref refers to the Prot-ref held by the xref
            prot_ref.Reset( &xref->SetData().SetProt() );
        }

        // replacement prot feature
        EAction action = eAction_Nothing;

        if (NStr::Equal(qual, "prot_note") ) {
            // prot_note is only applied when a real protein feature was
            // found; it becomes, or is appended to, that feature's comment
            if( prot_feat ) {
                if (!prot_feat->IsSetComment() || NStr::IsBlank (prot_feat->GetComment())) {
                    SET_FIELD( *prot_feat, Comment, val);
                } else {
                    SET_FIELD( *prot_feat, Comment, (prot_feat->GetComment() + "; " + val) );
                }
                ChangeMade (CCleanupChange::eChangeComment);
                action = eAction_Erase;
            }
        } else {
            action = x_ProtGBQualBC( *prot_ref, gb_qual, eGBQualOpt_CDSMode );
        }

        // NOTE(review): despite the flag's name, the new xref is attached
        // without looking at 'action', i.e. also when nothing was applied
        // (e.g. prot_note without a protein feature) - confirm this is intended.
        if( push_back_xref_on_success ) {
            feat.SetXref().push_back( xref );
            ChangeMade(CCleanupChange::eCleanSeqFeatXrefs);
        }

        return action;
    }

    // a translation qualifier is always dropped
    if (NStr::EqualNocase(qual, "translation")) {
        return eAction_Erase;
    }

    return eAction_Nothing;
}

// One entry of the amino acid name table below: a spelling of an amino acid
// (three-letter code or longer name) paired with its one-letter symbol,
// stored as an int.
typedef SStaticPair<const char *, int> TTrnaKey;

// Amino acid names accepted when parsing tRNA text, each mapped to the
// one-letter symbol.  Several names may map to the same symbol
// (e.g. "Met", "fMet", "iMet" and "Methionine" all give 'M').
// Ambiguity codes ('B', 'Z', 'J', 'X') and the terminator ('*') are included.
//
// NOTE(review): the entries are listed in case-insensitive alphabetical
// order, presumably because the static array map built from this table
// needs sorted input - keep that order when adding names, and confirm.
static const TTrnaKey trna_key_to_subtype [] = {
    {  "Ala",            'A'  },
    {  "Alanine",        'A'  },
    {  "Arg",            'R'  },
    {  "Arginine",       'R'  },
    {  "Asn",            'N'  },
    {  "Asp",            'D'  },
    {  "Asp or Asn",     'B'  },
    {  "Asparagine",     'N'  },
    {  "Aspartate",      'D'  },
    { "Aspartic",        'D'  },
    { "Aspartic Acid",   'D'  },
    {  "Asx",            'B'  },
    {  "Cys",            'C'  },
    {  "Cysteine",       'C'  },
    {  "fMet",           'M'  },
    {  "Gln",            'Q'  },
    {  "Glu",            'E'  },
    {  "Glu or Gln",     'Z'  },
    {  "Glutamate",      'E'  },
    {  "Glutamic",       'E'  },
    {  "Glutamic Acid",  'E'  },
    {  "Glutamine",      'Q'  },
    {  "Glx",            'Z'  },
    {  "Gly",            'G'  },
    {  "Glycine",        'G'  },
    {  "His",            'H'  },
    {  "Histidine",      'H'  },
    {  "Ile",            'I'  },
    {  "Ile2",           'I'  },
    {  "iMet",           'M'  },
    {  "Isoleucine",     'I'  },
    {  "Leu",            'L'  },
    {  "Leu or Ile",     'J'  },
    {  "Leucine",        'L'  },
    {  "Lys",            'K'  },
    {  "Lysine",         'K'  },
    {  "Met",            'M'  },
    {  "Methionine",     'M'  },
    {  "OTHER",          'X'  },
    {  "Phe",            'F'  },
    {  "Phenylalanine",  'F'  },
    {  "Pro",            'P'  },
    {  "Proline",        'P'  },
    {  "Pyl",            'O'  },
    {  "Pyrrolysine",    'O'  },
    {  "Sec",            'U'  },
    {  "Selenocysteine", 'U'  },
    {  "Ser",            'S'  },
    {  "Serine",         'S'  },
    {  "Ter",            '*'  },
    {  "TERM",           '*'  },
    {  "Termination",    '*'  },
    {  "Thr",            'T'  },
    {  "Threonine",      'T'  },
    {  "Trp",            'W'  },
    {  "Tryptophan",     'W'  },
    {  "Tyr",            'Y'  },
    {  "Tyrosine",       'Y'  },
    {  "Val",            'V'  },
    {  "Valine",         'V'  },
    {  "Xle",            'J'  },
    {  "Xxx",            'X'  }
};

// Lookup from amino acid name to one-letter symbol, built over the table
// above; the PNocase_CStr comparator makes the name lookup case-insensitive.
typedef CStaticPairArrayMap <const char*, int, PNocase_CStr> TTrnaMap;
DEFINE_STATIC_ARRAY_MAP(TTrnaMap, sm_TrnaKeys, trna_key_to_subtype);

// Reverse of sm_TrnaKeys: goes from the one-letter symbol to every name
// listed for it.  It is a multimap because several names share a symbol;
// the symbol is compared with PNocase_LessChar.
class CAminoAcidCharToSymbol : public multimap<char, const char*, PNocase_LessChar> 
{
public:
    // Inserts (symbol, name) for each of the first 'num_keys' entries of
    // 'keys', in table order.
    CAminoAcidCharToSymbol( const TTrnaKey keys[], int num_keys )
    {
        for( int idx = 0; idx < num_keys; ++idx ) {
            const TTrnaKey& entry = keys[idx];
            insert(value_type( entry.second, entry.first ));
        }
    }
};

// the inverse map, filled from the whole of trna_key_to_subtype
static const CAminoAcidCharToSymbol sm_TrnaInverseKeys(
    trna_key_to_subtype,
    sizeof(trna_key_to_subtype) / sizeof(*trna_key_to_subtype) );

// Parses the value of an anticodon qualifier of the form
//     (pos:<location>[,aa:<amino acid>]...)
// into a CTrna_ext.
//
//   str   - the qualifier value
//   feat  - the feature the qualifier sits on; its location supplies the
//           Seq-id (and strand) used for the anticodon location
//   scope - used to read the location text and to look up the sequence
//
// Returns:
//   - a null CRef if 'str' is blank, does not start with "(pos:", has no
//     matching closing parenthesis, or if the anticodon location cannot be
//     checked against the sequence (no single Seq-id on the feature
//     location, sequence not found, or location past the end of it);
//   - an empty CTrna_ext if the amino acid name is not recognized, or if the
//     location text could not be read;
//   - otherwise a CTrna_ext with the anticodon set and, if "aa:" was
//     present, the amino acid set as Iupacaa.
static CRef<CTrna_ext> s_ParseTRnaFromAnticodonString (const string &str, const CSeq_feat& feat, CScope *scope)
{
    CRef<CTrna_ext> trna;
    
    if (NStr::IsBlank (str)) return trna;

    if (NStr::StartsWith (str, "(pos:")) {
        // find position of closing paren
        string::size_type pos_end = s_MatchingParenPos( str, 0 );
        if (pos_end != string::npos) {
            trna.Reset( new CTrna_ext );
            // text between "(pos:" and the closing paren
            string pos_str = str.substr (5, pos_end - 5);
            string::size_type aa_start = NStr::FindNoCase (pos_str, "aa:");
            if (aa_start != string::npos) {
                // everything after "aa:" is taken as the amino acid name
                string abbrev = pos_str.substr (aa_start + 3);
                TTrnaMap::const_iterator t_iter = sm_TrnaKeys.find (abbrev.c_str ());
                if (t_iter == sm_TrnaKeys.end ()) {
                    // unable to parse
                    return trna;
                }
                CRef<CTrna_ext::TAa> aa(new CTrna_ext::TAa);
                aa->SetIupacaa (t_iter->second);
                trna->SetAa(*aa);
                // keep only the location part, without the separating comma
                pos_str = pos_str.substr (0, aa_start);
                NStr::TruncateSpacesInPlace (pos_str);
                if (NStr::EndsWith (pos_str, ",")) {
                    pos_str = pos_str.substr (0, pos_str.length() - 1);
                }
            }
            const CSeq_loc& loc = feat.GetLocation();
            // GetId() can come back NULL (this file checks for that on the
            // product location as well), so it must not be dereferenced
            // blindly below.
            const CSeq_id* loc_id = loc.GetId();
            CRef<CSeq_loc> anticodon = ReadLocFromText (pos_str, loc_id, scope);
            if( anticodon ) {
                if (loc_id == NULL) {
                    // no sequence to validate against
                    trna.Reset(NULL);
                    return trna;
                }
                CBioseq_Handle bsh = scope->GetBioseqHandle(*loc_id);
                if (!bsh) {
                    trna.Reset(NULL);
                    return trna;
                }
                // reject an anticodon that extends past the end of the sequence
                if (anticodon->GetStop(eExtreme_Positional) >= bsh.GetInst_Length()) {
                    trna.Reset(NULL);
                    return trna;
                }
                // the anticodon takes the strand of the feature location if
                // that has one, plus otherwise
                if (loc.IsSetStrand()) {
                    anticodon->SetStrand(loc.GetStrand());
                } else {
                    anticodon->SetStrand(eNa_strand_plus);
                }
            }
            if (anticodon == NULL) {
                // without a location the amino acid is not reported either
                trna->ResetAa();
            } else {
                trna->SetAnticodon(*anticodon);
            }
        }
    }
    return trna;        
}

static
char s_FindTrnaAA( const string &str )
{
    if ( str.empty() ) return '\0';
    string tmp = str;
    NStr::TruncateSpacesInPlace(tmp);
    
    if( tmp.length() == 1 ) {
        // if the string is a valid one-letter code, just return that
        const char aminoAcidLetter = toupper(tmp[0]);
        if( sm_TrnaInverseKeys.find(aminoAcidLetter) != sm_TrnaInverseKeys.end() ) {
            return aminoAcidLetter;
        }
    } else {
        // translate 3-letter codes and full-names to one-letter codes
        TTrnaMap::const_iterator trna_iter = sm_TrnaKeys.find (tmp.c_str ());
        if( trna_iter != sm_TrnaKeys.end() ) {
            return trna_iter->second;
        }
    }

    return '\0';
}

// Unary predicate that reports whether a character belongs to a fixed
// collection of characters, given as a string at construction time.
class CCharInSet {
public:
    // every character of 'list_of_characters' becomes a member of the set
    CCharInSet( const string &list_of_characters )
        : char_set( list_of_characters.begin(), list_of_characters.end() )
    {
    }

    // true if 'ch' is one of the characters given to the constructor
    bool operator()( const char ch ) const {
        return char_set.count(ch) != 0;
    }

private:
    set<char> char_set;
};

static
void s_TokenizeTRnaString (const string &tRNA_string, list<string> &out_string_list )
{
    out_string_list.clear();
    if ( tRNA_string.empty() ) return;

    // SGD Tx(NNN)c or Tx(NNN)c#, where x is the amino acid, c is the chromosome (A-P, Q for mito),
    // and optional # is presumably for individual tRNAs with different anticodons and the same
    // amino acid.
    CCachedRegexp valid_sgd_regex = regexpCache.Get(
        "^[Tt][A-Za-z]\\(...\\)[A-Za-z]\\d?\\d?$");
    if ( valid_sgd_regex->IsMatch(tRNA_string) ) {
        // parse SGD tRNA anticodon
        out_string_list.push_back(kEmptyStr);
        string &new_SGD_tRNA_anticodon = out_string_list.back();
        string raw_codon_part = tRNA_string.substr(3,3);
        NStr::ToUpper( raw_codon_part );
        string reverse_complement;
        CSeqManip::ReverseComplement( raw_codon_part, CSeqUtil::e_Iupacna, 0, 3, reverse_complement );
        new_SGD_tRNA_anticodon = string("(") + reverse_complement + ')';

        // parse SGD tRNA amino acid
        out_string_list.push_back(tRNA_string.substr(1,1));
        return;
    }

    string tRNA_string_copy = tRNA_string;
    // Note that we do NOT remove "*", since it might be a terminator tRNA symbol
    replace_if( tRNA_string_copy.begin(), tRNA_string_copy.end(), 
        CCharInSet("-,;:()=\'_~"), ' ' );

    vector<string> tRNA_tokens;
    // " \t\n\v\f\r" are the standard whitespace chars
    // ( source: http://www.cplusplus.com/reference/clibrary/cctype/isspace/ )
    NStr::Split(tRNA_string_copy, " \t\n\v\f\r", tRNA_tokens, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);

    EDIT_EACH_STRING_IN_VECTOR( tRNA_token_iter, tRNA_tokens ) {
        string &tRNA_token = *tRNA_token_iter;
        // remove initial "tRNA", if any
        if ( NStr::StartsWith(tRNA_token, "tRNA", NStr::eNocase) ) {
            tRNA_token = tRNA_token.substr(4);
        }
        CCachedRegexp threeLettersPlusDigits = regexpCache.Get(
            "^[A-Za-z][A-Za-z][A-Za-z]\\d*$");
        if (! tRNA_token.empty() ) {
            if ( threeLettersPlusDigits->IsMatch(tRNA_token) ) {
                tRNA_token = tRNA_token.substr(0, 3);
            }
            out_string_list.push_back(tRNA_token);
        }
    }
}


// based on C's ParseTRnaString
//
// Looks for an amino acid in free text describing a tRNA.
//
//   comment          - the text to examine
//   out_justTrnaText - optional (may be NULL).  Set to true when the text
//                      holds nothing besides amino acid names and the words
//                      "tRNA", "transfer", "RNA" and "product", and contains
//                      no digit; set to false otherwise (also for empty text).
//   tRNA_codon       - always cleared; nothing in this function fills it in
//   noSingleLetter   - when true, one-character tokens are not accepted as
//                      amino acid symbols
//
// Returns the one-letter amino acid symbol, or '\0' when the text is empty,
// names no amino acid, or names two different ones.
static 
char s_ParseSeqFeatTRnaString( const string &comment, bool *out_justTrnaText, string &tRNA_codon, bool noSingleLetter )
{
    if (out_justTrnaText != NULL) {
        *out_justTrnaText = false;
    }
    tRNA_codon.clear();

    if ( comment.empty() ) return '\0';

    char aa = '\0';
    list<string> head;
    s_TokenizeTRnaString (comment, head);
    bool justt = true;
    list<string>::const_iterator head_iter = head.begin();
    bool is_ambig = false;
    for( ; head_iter != head.end(); ++head_iter ) {
        const string &str = *head_iter;
        if( str.empty() ) continue;
        char curraa = '\0';
        if (noSingleLetter && str.length() == 1) {
            curraa = '\0';
        } else {
            curraa = s_FindTrnaAA (str);
        }
        if(curraa != '\0') {
            if (aa == '\0') {
                // first amino acid seen
                aa = curraa;
            } else if( curraa != aa) {
                // a second, different amino acid: the text is ambiguous
                is_ambig = true;
            }
        } else if ( ! NStr::EqualNocase ("tRNA", str) &&
            ! NStr::EqualNocase ("transfer", str) &&
            ! NStr::EqualNocase ("RNA", str) &&
            ! NStr::EqualNocase ("product", str) ) 
        {
            // some word that is neither an amino acid nor tRNA boilerplate
            justt = false;
        }
    }
    if( is_ambig ) {
        aa = '\0';
    }

    if (justt) {
        // any digit in the original text means there is more to it than
        // plain tRNA wording
        if( comment.find_first_of("0123456789") != string::npos ) {
            justt = false;
        }
    }
    if (out_justTrnaText != NULL) {
        *out_justTrnaText = justt;
    }
    return aa;
}


// Adds 'comment' to the comment of 'feat': it becomes the comment if the
// feature has none, otherwise it is appended after "; ".
// A comment change is recorded in either case.
void CNewCleanup_imp::x_AddToComment(CSeq_feat& feat, const string& comment)
{
    if (feat.IsSetComment()) {
        // build the addition first, then append it to the existing text
        const string addition = "; " + comment;
        feat.SetComment() += addition;
    } else {
        feat.SetComment(comment);
    }
    ChangeMade(CCleanupChange::eChangeComment);
}

// Tries to absorb a "product" qualifier value into the tRNA extension of an
// RNA feature.
//
//   feat    - the feature carrying the qualifier (its comment may be extended)
//   rna     - the RNA-ref of that feature
//   product - the qualifier value
//
// Returns eAction_Erase when the qualifier is no longer needed and
// eAction_Nothing when it has to stay.  Only RNA types tRNA, other and
// unknown are considered; the three main cases below apply to tRNA only.
CNewCleanup_imp::EAction
CNewCleanup_imp::x_HandleTrnaProductGBQual(CSeq_feat& feat, CRNA_ref& rna, const string& product)
{
    CRNA_ref::TType& rna_type = rna.SetType();

    if (rna_type != CRNA_ref::eType_tRNA && 
        rna_type != CRNA_ref::eType_other &&
        rna_type != CRNA_ref::eType_unknown) {
        return eAction_Nothing;
    }

    // Case 1: a tRNA whose extension is still a plain name.  If an amino acid
    // can be read from that name the extension is switched to a tRNA
    // extension carrying it.  There is no return here, so the checks further
    // down still run afterwards.
    if (rna_type == NCBI_RNAREF(tRNA) && rna.IsSetExt() && rna.GetExt().IsName()) {
        // copied, because the name is replaced by SetTRNA() below
        string name = rna.GetExt().GetName();
        bool justTrnaText = false;
        string codon;
        char aa = s_ParseSeqFeatTRnaString(name, &justTrnaText, codon, false);
        if (aa != '\0') {
            const bool is_fMet = (NStr::Find(name, "fMet") != NPOS);
            const bool is_iMet = (NStr::Find(name, "iMet") != NPOS);
            const bool is_Ile2 = (NStr::Find(name, "Ile2") != NPOS);
            CRNA_ref_Base::C_Ext::TTRNA &trp = rna.SetExt().SetTRNA();
            trp.SetAa().SetNcbieaa(aa);
            // the special variants cannot be expressed by the one-letter
            // symbol, so they are recorded in the feature comment
            if (aa == 'M') {
                if (is_fMet) {
                    x_AddToComment(feat, "fMet");
                } else if (is_iMet) {
                    x_AddToComment(feat, "iMet");
                }
            } else if (aa == 'I') {
                if (is_Ile2) {
                    x_AddToComment(feat, "Ile2");
                }
            }
            x_SeqFeatTRNABC(feat, trp);
            ChangeMade(CCleanupChange::eChangeRNAref);
        }
    }
    // Case 2: a tRNA without any extension.  The amino acid is read from the
    // product text itself.
    if (rna_type == NCBI_RNAREF(tRNA) && !rna.IsSetExt()) {
        // this part inserted from: AddQualifierToFeature (sfp, "product", gb_qual_val);
        bool justTrnaText = false;
        string codon;
        char aa = s_ParseSeqFeatTRnaString(product, &justTrnaText, codon, false);
        if (aa != '\0') {

            CRNA_ref_Base::C_Ext::TTRNA& trna = rna.SetExt().SetTRNA();
            trna.SetAa().SetNcbieaa(aa);

            // text that says more than just "tRNA-Xxx" is preserved in the comment
            if (!justTrnaText || !NStr::IsBlank(codon)) {
                x_AddToComment(feat, product);
            }

            // For fMet / iMet / Ile2 products not yet mentioned in the
            // comment the qualifier is kept (eAction_Nothing) rather than
            // erased; the calls that would add the comment are commented out.
            if (aa == 'M') {
                if (NStr::Find(product, "fMet") != NPOS &&
                    (!feat.IsSetComment() || NStr::Find(feat.GetComment(), "fMet") == NPOS)) {
                    // x_AddToComment(feat, "fMet");
                    ChangeMade(CCleanupChange::eChangeRNAref);
                    return eAction_Nothing;
                } else if (NStr::Find(product, "iMet") != NPOS &&
                    (!feat.IsSetComment() || NStr::Find(feat.GetComment(), "iMet") == NPOS)) {
                    // x_AddToComment(feat, "iMet");
                    ChangeMade(CCleanupChange::eChangeRNAref);
                    return eAction_Nothing;
                }
            } else if (aa == 'I') {
                if (NStr::Find(product, "Ile2") != NPOS &&
                    (!feat.IsSetComment() || NStr::Find(feat.GetComment(), "Ile2") == NPOS)) {
                    // x_AddToComment(feat, "Ile2");
                    ChangeMade(CCleanupChange::eChangeRNAref);
                    return eAction_Nothing;
                }
            }

            ChangeMade(CCleanupChange::eChangeRNAref);
        }
        else {
            // no amino acid recognized: keep the text as a comment
            x_AddToComment(feat, product);
        }
        return eAction_Erase;
    }
    // Case 3: a tRNA that already has a tRNA extension.
    if (rna_type == NCBI_RNAREF(tRNA) && rna.IsSetExt() && rna.GetExt().IsTRNA()) {
        CRNA_ref_Base::C_Ext::TTRNA& trp = rna.SetExt().SetTRNA();
        if (trp.IsSetAa() && trp.GetAa().IsNcbieaa()) {
            // an amino acid is already stored: the qualifier may only go away
            // if it names the same one
            string ignored = kEmptyStr;
            if (trp.GetAa().GetNcbieaa() == s_ParseSeqFeatTRnaString(product, NULL, ignored, false) &&
                NStr::IsBlank(ignored)) {
                // same amino acid - continue with the checks below
            } else {
                // don't remove product qual because it conflicts with existing aa value
                return eAction_Nothing;
            }
            // the special variants are kept as qualifiers
            // NOTE(review): "iRNA-..." in the comparisons below looks like it
            // could be a typo for "tRNA-..." - confirm before changing.
            if (NStr::CompareNocase (product, "tRNA-fMet") == 0 || NStr::CompareNocase (product, "iRNA-fMet") == 0) {
                return eAction_Nothing;
            }
            if (NStr::CompareNocase (product, "tRNA-iMet") == 0 || NStr::CompareNocase (product, "iRNA-iMet") == 0) {
                return eAction_Nothing;
            }
            if (NStr::CompareNocase (product, "tRNA-Ile2") == 0 || NStr::CompareNocase (product, "iRNA-Ile2") == 0) {
                return eAction_Nothing;
            }
            return eAction_Erase;
        } else if (!trp.IsSetAa()) {
            // no amino acid stored yet: take it from the product text
            string ignored = kEmptyStr; 
            bool justTrnaText = false;
            char aa = s_ParseSeqFeatTRnaString(product, &justTrnaText, ignored, false);
            if (aa != '\0') {
                trp.SetAa().SetNcbieaa(aa);
                if (!justTrnaText || !NStr::IsBlank(ignored)) {
                    x_AddToComment(feat, product);
                }
                if (NStr::CompareNocase(product, "tRNA-fMet") == 0 ||
                    NStr::CompareNocase(product, "iRNA-fMet") == 0 ||
                    NStr::CompareNocase(product, "tRNA-iMet") == 0 ||
                    NStr::CompareNocase(product, "iRNA-iMet") == 0 ||
                    NStr::CompareNocase(product, "tRNA-Ile2") == 0 ||
                    NStr::CompareNocase(product, "iRNA-Ile2") == 0) {
                    return eAction_Nothing;
                }
                return eAction_Erase;
            }
        }
    }

    // a product identical to the extension name carries no extra information
    if (rna.IsSetExt() && rna.GetExt().IsName() && NStr::Equal(rna.GetExt().GetName(), product)) {
        return eAction_Erase;
    }

    return eAction_Nothing;
}


// Handler for a "standard_name" qualifier on an RNA feature.
// Currently a no-op: none of the arguments is used and eAction_Nothing is
// always returned, so the qualifier is left in place.
CNewCleanup_imp::EAction CNewCleanup_imp::x_HandleStandardNameRnaGBQual(CSeq_feat& feat, CRNA_ref& rna, const string& standard_name)
{
    return eAction_Nothing;
}


// homologous to C's HandledGBQualOnRNA.
// That func was copy-pasted, then translated into C++.
// Later we can go back and actually refactor the code
// to make it more efficient or cleaner.
//
// Applies one GenBank qualifier found on an RNA feature to the RNA-ref.
//
//   feat    - the RNA feature carrying the qualifier
//   rna     - the RNA-ref of that feature
//   gb_qual - the qualifier; its value may be edited in place
//
// Returns eAction_Erase when the caller should drop the qualifier and
// eAction_Nothing when it should be kept.
//
// Qualifiers handled: "standard_name" (delegated), "product" and
// "anticodon".  A qualifier without a value, or with a blank value, is kept.
CNewCleanup_imp::EAction 
CNewCleanup_imp::x_SeqFeatRnaGBQualBC(CSeq_feat& feat, CRNA_ref& rna, CGb_qual& gb_qual)
{
    if( ! gb_qual.IsSetVal()) {
        return eAction_Nothing;
    }
    const string &gb_qual_qual = gb_qual.GetQual();
    string &gb_qual_val = gb_qual.SetVal();
    TRNAREF_TYPE& rna_type = rna.SetType();

    if (NStr::EqualNocase(gb_qual_qual, "standard_name")) {
        return x_HandleStandardNameRnaGBQual(feat, rna, gb_qual_val);
    }
    if (NStr::IsBlank(gb_qual_val)) {
        return eAction_Nothing;
    }

    if (NStr::EqualNocase( gb_qual_qual, "product" )) 
    {
        // an RNA of unknown type that has a product becomes "other"
        if (rna_type == NCBI_RNAREF(unknown)) {
            rna_type = NCBI_RNAREF(other);
            ChangeMade(CCleanupChange::eChangeRNAref);
        }
        // an empty name extension is as good as no extension
        if ( rna.IsSetExt() && rna.GetExt().IsName() ) {
            const string &name = rna.SetExt().SetName();
            if ( name.empty() ) {
                rna.ResetExt();
                ChangeMade(CCleanupChange::eChangeRNAref);
            }
        }
        // tRNA specific handling first
        if (x_HandleTrnaProductGBQual(feat, rna, gb_qual_val) == eAction_Erase) {
            return eAction_Erase;
        }

        if (!rna.IsSetExt()) {
            // no extension at all: store the product in the RNA-ref
            string remainder;
            rna.SetRnaProductName(gb_qual_val, remainder);
            ChangeMade(CCleanupChange::eChangeRNAref);
            if (NStr::IsBlank(remainder)) {
                return eAction_Erase;
            } else {
                // Keep what could not be stored as the *value* of the
                // product qualifier.  (This used to overwrite the qualifier
                // name with the leftover text.)
                gb_qual.SetVal(remainder);
                return eAction_Nothing;
            }
        }
        if( rna.GetExt().IsGen() ) {
            // general extension: fill in a missing product only
            CRNA_gen & rna_gen = rna.SetExt().SetGen();
            if( RAW_FIELD_IS_EMPTY_OR_UNSET(rna_gen, Product) ) {
                rna_gen.SetProduct(gb_qual_val);
                ChangeMade(CCleanupChange::eChangeRNAref);
                return eAction_Erase;
            }
            return eAction_Nothing;
        }
        // the qualifier repeats the name that is already there
        if (rna.GetExt().IsName() && NStr::Equal(rna.GetExt().GetName(), gb_qual_val)) {
            return eAction_Erase;
        }
        // any other non-name extension: leave the qualifier alone
        if ( rna.IsSetExt() && ! rna.GetExt().IsName() ) return eAction_Nothing;
        const string &name = ( rna.IsSetExt() ? rna.GetExt().GetName() : kEmptyStr );
        if (! name.empty() ) {
            // "rDNA" in the qualifier value is rewritten to "rRNA"
            SIZE_TYPE rDNA_pos = NStr::Find( gb_qual_val, "rDNA");
            if (rDNA_pos != NPOS) {
                gb_qual_val[rDNA_pos+1] = 'R';
                ChangeMade(CCleanupChange::eChangeQualifiers);
            }
            if ( NStr::EqualNocase(name, gb_qual_val) ) {
                return eAction_Erase;
            }
            if (rna_type == NCBI_RNAREF(other) || rna_type == NCBI_RNAREF(ncRNA) || 
                rna_type == NCBI_RNAREF(tmRNA) || rna_type == NCBI_RNAREF(miscRNA) ) 
            {
                // new convention follows ASN.1 spec comments, allows new RNA types
                return eAction_Nothing;
            }
            // subsequent /product now added to comment
            x_AddToComment(feat, gb_qual_val);
            ChangeMade(CCleanupChange::eChangeComment);
            return eAction_Erase;
        }
        if (rna_type == NCBI_RNAREF(ncRNA) || 
            rna_type == NCBI_RNAREF(tmRNA) || rna_type == NCBI_RNAREF(miscRNA) ) 
        {
            // new convention follows ASN.1 spec comments, allows new RNA types
            return eAction_Nothing;
        }
        if ( ! FIELD_CHOICE_EQUALS( rna, Ext, Name, gb_qual_val) ) {
            rna.SetExt().SetName( gb_qual_val );
            ChangeMade(CCleanupChange::eChangeRNAref);
            return eAction_Erase;
        }
    } else if (NStr::EqualNocase(gb_qual_qual, "anticodon") ) {
        if (!rna.IsSetType() || rna.GetType() == CRNA_ref::eType_unknown) {
            rna.SetType(CRNA_ref::eType_other);
            ChangeMade(CCleanupChange::eChangeKeywords);
        }
        // anticodons only make sense on tRNAs
        if (rna.GetType() != CRNA_ref::eType_tRNA) {
            return eAction_Nothing;
        }

        CRef<CTrna_ext> trna = s_ParseTRnaFromAnticodonString(gb_qual.GetVal(), feat, m_Scope);
        if (!trna) {
            return eAction_Nothing;
        }

        x_SeqFeatTRNABC( feat, *trna );
        // NOTE(review): the parser stores the amino acid as Iupacaa, while
        // the code below reads it both as Iupacaa (conflict check) and as
        // Ncbieaa (when applying).  Which one is valid depends on what
        // x_SeqFeatTRNABC does to it - confirm.
        if (trna->IsSetAa() || trna->IsSetAnticodon()) {
            // don't apply at all if there are conflicts
            bool apply_aa = false;
            bool apply_anticodon = false;
            bool ok_to_apply = true;
                
            // look for conflict with aa
            if (!rna.IsSetExt() || !rna.GetExt().IsTRNA()) {
                // nothing to conflict with: take whatever was parsed
                if (trna->IsSetAa()) {
                    apply_aa = true;
                }
                if (trna->IsSetAnticodon()) {
                    apply_anticodon = true;
                }
            }
            else {
                if (trna->IsSetAa()) {
                    if (rna.GetExt().GetTRNA().IsSetAa()) {
                        if (rna.GetExt().GetTRNA().GetAa().IsIupacaa()) {
                            if (trna->GetAa().GetIupacaa() != rna.GetExt().GetTRNA().GetAa().GetIupacaa()) {
                                ok_to_apply = false;
                            }
                        }
                    }
                    else {
                        apply_aa = true;
                    }
                }
                // look for conflict with anticodon
                if (trna->IsSetAnticodon()) {
                    if (rna.GetExt().GetTRNA().IsSetAnticodon()) {
                        if (sequence::Compare(rna.GetExt().GetTRNA().GetAnticodon(),
                            trna->GetAnticodon(), m_Scope, sequence::fCompareOverlapping) != sequence::eSame) {
                            ok_to_apply = false;
                        }
                    } else {
                        apply_anticodon = true;
                    }
                }
            }
            if (ok_to_apply) {
                if (apply_aa) {
                    rna.SetExt().SetTRNA().SetAa().SetIupacaa(trna->GetAa().GetNcbieaa());
                    ChangeMade(CCleanupChange::eChange_tRna);
                }
                if (apply_anticodon) {
                    CRef<CSeq_loc> anticodon(new CSeq_loc());
                    anticodon->Add(trna->GetAnticodon());
                    rna.SetExt().SetTRNA().SetAnticodon(*anticodon);
                    ChangeMade(CCleanupChange::eChangeAnticodon);
                }
                // erased also when the parsed data merely agrees with what
                // is already stored
                return eAction_Erase;
            }
        }
    }
    return eAction_Nothing;
}


// Applies one GenBank qualifier to a Prot-ref.
//
//   prot    - the protein reference to update
//   gb_qual - the qualifier being examined
//   opt     - with eGBQualOpt_CDSMode a "product"/"standard_name" qualifier
//             is applied even if the protein already has a name
//
// "product" / "standard_name" set the protein name, "function" is added to
// the activities and "EC_number" to the EC numbers (names are matched
// case-insensitively).
//
// Returns eAction_Nothing for a product that was not applied and for the
// qualifiers listed in ignored_quals below; every other qualifier, including
// those that have just been applied, yields eAction_Erase.
CNewCleanup_imp::EAction
CNewCleanup_imp::x_ProtGBQualBC(CProt_ref& prot, const CGb_qual& gb_qual, EGBQualOpt opt)
{
    const string& qual = gb_qual.GetQual();
    const string& val = gb_qual.GetVal();

    if (NStr::EqualNocase(qual, "product") || NStr::EqualNocase(qual, "standard_name")) {
        // A name list that is set but empty counts as "no name"; checking
        // for it also keeps front() from being called on an empty list.
        if (opt == eGBQualOpt_CDSMode || !prot.IsSetName() ||
            prot.GetName().empty() || NStr::IsBlank(prot.GetName().front())) {
            CCleanup::SetProteinName(prot, val, false);
            ChangeMade(CCleanupChange::eChangeQualifiers);
        } else {
            return eAction_Nothing;
        }
    } else if (NStr::EqualNocase(qual, "function")) {
        ADD_STRING_TO_LIST(prot.SetActivity(), val);
        ChangeMade(CCleanupChange::eChangeQualifiers);
    } else if (NStr::EqualNocase(qual, "EC_number")) {
        ADD_STRING_TO_LIST(prot.SetEc(), val);
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }

    // labels to leave alone
    static const char * const ignored_quals[] =
    { "label", "allele", "experiment", "inference", "UniProtKB_evidence",
    "dbxref", "replace", "rpt_unit_seq", "rpt_unit_range" };

    // Case-insensitive lookup set over the names above.  It is built once,
    // on the first call: initialization of a function-local static is
    // thread-safe since C++11, so no explicit mutex is needed and later
    // calls only read the set.
    static const set<string, PNocase> ignored_quals_raw(
        ignored_quals,
        ignored_quals + sizeof(ignored_quals) / sizeof(ignored_quals[0]));

    if (ignored_quals_raw.find(qual) != ignored_quals_raw.end()) {
        return eAction_Nothing;
    }

    // all other gbquals not appropriate on protein features
    return eAction_Erase;
}


// Extended cleanup of a BioSource: first adds any missing
// environmental-sample / metagenomic subsources, then, when an org-ref is
// present, runs the old-name and orgmod-note cleanups on it.
void CNewCleanup_imp::BioSourceEC(CBioSource& biosrc)
{
    x_AddEnvSamplOrMetagenomic(biosrc);

    if (!biosrc.IsSetOrg()) {
        return;
    }

    COrg_ref& org_ref = biosrc.SetOrg();
    x_CleanupOldName(org_ref);
    x_CleanupOrgModNoteEC(org_ref);
}


// Adds an environmental_sample and/or metagenomic subsource when the
// organism's lineage or division calls for one and the biosource does not
// already carry it.
//
//  - lineage contains "environmental sample" (any case), or div == "ENV"
//        -> environmental_sample is required
//  - lineage contains "metagenomes" (any case)
//        -> metagenomic is required
void CNewCleanup_imp::x_AddEnvSamplOrMetagenomic(CBioSource& biosrc)
{
    if (!biosrc.IsSetOrg()) {
        return;
    }
    auto& org_ref = biosrc.SetOrg();
    if (!org_ref.IsSetOrgname()) {
        return;
    }
    const auto& org_name = org_ref.GetOrgname();

    // Work out which subsources the organism data asks for.
    bool want_env_sample = false;
    bool want_metagenomic = false;
    if (org_name.IsSetLineage()) {
        const string lineage_text = org_name.GetLineage();
        want_env_sample =
            (NStr::FindNoCase(lineage_text, "environmental sample") != string::npos);
        want_metagenomic =
            (NStr::FindNoCase(lineage_text, "metagenomes") != string::npos);
    }
    if (org_name.IsSetDiv() && NStr::Equal(org_name.GetDiv(), "ENV")) {
        want_env_sample = true;
    }
    if (!want_env_sample && !want_metagenomic) {
        return;
    }

    // Find out which of them are already present.
    bool found_env_sample = false;
    bool found_metagenomic = false;
    if (biosrc.IsSetSubtype()) {
        for (const auto& subsrc : biosrc.GetSubtype()) {
            if (!subsrc->IsSetSubtype()) {
                continue;
            }
            const auto kind = subsrc->GetSubtype();
            if (kind == CSubSource::eSubtype_environmental_sample) {
                found_env_sample = true;
            }
            if (kind == CSubSource::eSubtype_metagenomic) {
                found_metagenomic = true;
            }
        }
    }

    // Append whatever is missing (environmental_sample first).
    if (want_env_sample && !found_env_sample) {
        biosrc.SetSubtype().push_back(
            CRef<CSubSource>(new CSubSource(CSubSource::eSubtype_environmental_sample, "")));
        ChangeMade(CCleanupChange::eAddSubSource);
    }
    if (want_metagenomic && !found_metagenomic) {
        biosrc.SetSubtype().push_back(
            CRef<CSubSource>(new CSubSource(CSubSource::eSubtype_metagenomic, "")));
        ChangeMade(CCleanupChange::eAddSubSource);
    }
}


// Predicate for remove_if: true for an old_name modifier whose subname is
// identical to the taxname and whose attrib is unset or blank.
struct SRemovableOldname
{
    const string& m_Taxname;
    bool operator()(CRef<COrgMod> mod)
    {
        if (!mod->IsSetSubtype() || mod->GetSubtype() != COrgMod::eSubtype_old_name) {
            return false;
        }
        if (!mod->IsSetSubname() || !NStr::Equal(mod->GetSubname(), m_Taxname)) {
            return false;
        }
        return !mod->IsSetAttrib() || NStr::IsBlank(mod->GetAttrib());
    }
};


// Drops old_name modifiers that merely repeat the taxname (see
// SRemovableOldname) and resets the modifier list if nothing is left in it.
void CNewCleanup_imp::x_CleanupOldName(COrg_ref& org)
{
    const bool has_work =
        org.IsSetTaxname() && org.IsSetOrgname() && org.GetOrgname().IsSetMod();
    if (!has_work) {
        return;
    }

    SRemovableOldname repeats_taxname{ org.GetTaxname() };
    auto& mods = org.SetOrgname().SetMod();
    const size_t count_before = mods.size();

    auto kept_end = std::remove_if(mods.begin(), mods.end(), repeats_taxname);
    mods.erase(kept_end, mods.end());

    if (mods.size() != count_before) {
        ChangeMade(CCleanupChange::eRemoveOrgmod);
    }
    if (mods.empty()) {
        // an empty list is not kept around as a set-but-empty field
        org.SetOrgname().ResetMod();
        ChangeMade(CCleanupChange::eRemoveOrgmod);
    }
}



bool s_HasMatchingGBMod(const COrgName& org, const string& val)
{
    if (!org.IsSetMod()) {
        return false;
    }
    ITERATE(COrgName::TMod, it, org.GetMod()) {
        if ((*it)->IsSetSubtype() &&
            ((*it)->GetSubtype() == COrgMod::eSubtype_gb_acronym ||
            (*it)->GetSubtype() == COrgMod::eSubtype_gb_anamorph ||
                (*it)->GetSubtype() == COrgMod::eSubtype_gb_synonym) &&
                (*it)->IsSetSubname() &&
            NStr::Equal((*it)->GetSubname(), val)) {
            return true;
        }
    }
    return false;
}


struct SRemovableOrgModNote {
    const COrg_ref& org;
    bool operator()(CRef<COrgMod> mod) {
        return (mod->IsSetSubtype() &&
            mod->GetSubtype() == COrgMod::eSubtype_other &&
            mod->IsSetSubname() &&
            (s_HasMatchingGBMod(org.GetOrgname(), mod->GetSubname()) ||
            (org.IsSetTaxname() && NStr::Equal(org.GetTaxname(), mod->GetSubname()))));

    }
};

// Removes "other" modifiers (notes) whose text only repeats the taxname or
// the subname of a gb_acronym / gb_anamorph / gb_synonym modifier on the
// same org, and resets the modifier list if it ends up empty.
//
// The values a note is compared against are collected up front.  Only
// notes are removed, so the gb_* modifiers never change while this runs;
// taking the snapshot first means the remove_if predicate does not walk
// the very list that remove_if is compacting (whose intermediate elements
// are in a moved-from state) and avoids a full rescan for every element.
void CNewCleanup_imp::x_CleanupOrgModNoteEC(COrg_ref& org)
{
    if (!org.IsSetOrgname() || !org.GetOrgname().IsSetMod()) {
        return;
    }
    auto& modset = org.SetOrgname().SetMod();

    // Exact (case-sensitive) strings that make a note redundant.
    set<string> redundant_text;
    if (org.IsSetTaxname()) {
        redundant_text.insert(org.GetTaxname());
    }
    for (const auto& mod : modset) {
        if (!mod->IsSetSubtype() || !mod->IsSetSubname()) {
            continue;
        }
        switch (mod->GetSubtype()) {
        case COrgMod::eSubtype_gb_acronym:
        case COrgMod::eSubtype_gb_anamorph:
        case COrgMod::eSubtype_gb_synonym:
            redundant_text.insert(mod->GetSubname());
            break;
        default:
            break;
        }
    }

    const size_t before = modset.size();
    modset.erase(
        std::remove_if(modset.begin(), modset.end(),
            [&redundant_text](const CRef<COrgMod>& mod) -> bool {
                return mod->IsSetSubtype() &&
                    mod->GetSubtype() == COrgMod::eSubtype_other &&
                    mod->IsSetSubname() &&
                    redundant_text.find(mod->GetSubname()) != redundant_text.end();
            }),
        modset.end());

    if (before != modset.size()) {
        ChangeMade(CCleanupChange::eRemoveOrgmod);
    }
    if (modset.empty()) {
        org.SetOrgname().ResetMod();
        ChangeMade(CCleanupChange::eRemoveOrgmod);
    }
}


#if 0
// NOTE: this function is currently compiled out by the enclosing #if 0.
//
// Flattens nested Pub-equivs: every member of pub_equiv that is itself an
// Equiv is flattened recursively, its members are appended to pub_equiv's
// own list, and the nested entry is erased.  A publication change is
// recorded for each nested entry handled.
void CNewCleanup_imp::x_FlattenPubEquiv(CPub_equiv& pub_equiv)
{
    CPub_equiv::Tdata& data = pub_equiv.Set();
    
    EDIT_EACH_PUB_ON_PUBEQUIV(pub_iter, pub_equiv ) {
        if( FIELD_IS(**pub_iter, Equiv) ) {
            CPub_equiv& equiv = GET_MUTABLE(**pub_iter, Equiv);
            // flatten the child first so that only non-Equiv pubs get copied up
            x_FlattenPubEquiv(equiv);
            // NOTE(review): this appends to the list that the enclosing
            // loop is iterating over -- confirm that the container and the
            // EDIT_EACH/ERASE macros tolerate that before re-enabling.
            copy(equiv.Set().begin(), equiv.Set().end(), back_inserter(data));
            ERASE_PUB_ON_PUBEQUIV( pub_iter, pub_equiv );
            ChangeMade(CCleanupChange::eChangePublication);
        }
    }
}
#endif


// Basic cleanup of a standard date: out-of-range fields are reset, and a
// finer-grained time field is never kept without its coarser parent
// (seconds need minutes, minutes and seconds need hours).
void CNewCleanup_imp::x_DateStdBC( CDate_std& date )
{
    if ( FIELD_OUT_OF_RANGE(date, Month, 1, 12) ) {
        RESET_FIELD(date, Month);
        ChangeMade(CCleanupChange::eCleanupDate);
    }

    // The day limit is a flat 31; month length and leap years are not
    // taken into account.
    if ( FIELD_OUT_OF_RANGE(date, Day, 1, 31) ) {
        RESET_FIELD(date, Day);
        ChangeMade(CCleanupChange::eCleanupDate);
    }

    if ( FIELD_OUT_OF_RANGE(date, Second, 0, 59) ) {
        RESET_FIELD(date, Second);
        ChangeMade(CCleanupChange::eCleanupDate);
    }

    // Minutes: a bad value takes the seconds with it; seconds without
    // minutes are dropped.
    if (!date.IsSetMinute()) {
        if (date.IsSetSecond()) {
            date.ResetSecond();
            ChangeMade(CCleanupChange::eCleanupDate);
        }
    } else {
        const auto minute = date.GetMinute();
        if (minute < 0 || minute > 59) {
            date.ResetMinute();
            date.ResetSecond();
            ChangeMade(CCleanupChange::eCleanupDate);
        }
    }

    // Hours: a bad value takes minutes and seconds with it; minutes or
    // seconds without hours are dropped.
    if (!date.IsSetHour()) {
        if (date.IsSetMinute() || date.IsSetSecond()) {
            date.ResetMinute();
            date.ResetSecond();
            ChangeMade(CCleanupChange::eCleanupDate);
        }
    } else {
        const auto hour = date.GetHour();
        if (hour < 0 || hour > 23) {
            date.ResetHour();
            date.ResetMinute();
            date.ResetSecond();
            ChangeMade(CCleanupChange::eCleanupDate);
        }
    }

}


// If `str` ends with ')' and contains a pair of double quotes, adds a
// "replace" qualifier to `feat` holding the lower-cased text between the
// first two quotes.  Otherwise nothing happens.
void CNewCleanup_imp::x_AddReplaceQual(CSeq_feat& feat, const string& str)
{
    if (!NStr::EndsWith(str, ')')) {
        return;
    }

    const SIZE_TYPE open_quote = str.find('\"');
    if (open_quote == NPOS) {
        return;
    }
    const SIZE_TYPE close_quote = str.find('\"', open_quote + 1);
    if (close_quote == NPOS) {
        return;
    }

    // text strictly between the two quote characters
    string quoted_text(str, open_quote + 1, (close_quote - open_quote) - 1);
    NStr::ToLower(quoted_text);
    feat.AddQualifier("replace", quoted_text);
    ChangeMade(CCleanupChange::eChangeQualifiers);
}

// Basic cleanup of a Seq-interval:
//  - swaps from/to when they are reversed;
//  - when the interval's sequence can be resolved in the scope, removes the
//    strand on protein sequences and, on other sequences, turns a missing
//    or "unknown" strand into plus.
void CNewCleanup_imp::x_SeqIntervalBC( CSeq_interval & seq_interval )
{
    // Fix backwards intervals
    const bool is_backwards =
        seq_interval.CanGetFrom()  &&  seq_interval.CanGetTo()  &&
        seq_interval.GetFrom() > seq_interval.GetTo();
    if (is_backwards) {
        swap(seq_interval.SetFrom(), seq_interval.SetTo());
        ChangeMade(CCleanupChange::eChangeSeqloc);
    }

    // The strand fix-ups need the sequence itself.
    if (!m_Scope || !seq_interval.IsSetId()) {
        return;
    }
    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq_interval.GetId());
    if (!bsh) {
        return;
    }

    if (bsh.IsProtein()) {
        // strands make no sense on proteins
        if (seq_interval.IsSetStrand()) {
            seq_interval.ResetStrand();
            ChangeMade(CCleanupChange::eChangeStrand);
        }
        return;
    }

    if (!seq_interval.IsSetStrand() ||
        seq_interval.GetStrand() == eNa_strand_unknown) {
        seq_interval.SetStrand(eNa_strand_plus);
        ChangeMade(CCleanupChange::eChangeStrand);
    }
}

// Replaces "both" / "both-rev" strands with plus / minus on a location.
// Only Int, Packed-int and Pnt locations are handled; every other kind of
// location is left untouched.
void CNewCleanup_imp::x_BothStrandBC( CSeq_loc &loc )
{
    switch (loc.Which()) {
    case CSeq_loc::e_Int :
        x_BothStrandBC( GET_MUTABLE(loc, Int) );
        break;

    case CSeq_loc::e_Packed_int :
        // each member interval is fixed individually
        for (auto& interval_ref : loc.SetPacked_int().Set()) {
            x_BothStrandBC(*interval_ref);
        }
        break;

    case CSeq_loc::e_Pnt :
        {
            CSeq_loc::TPnt& point = loc.SetPnt();
            if (!point.CanGetStrand()) {
                break;
            }
            // change both and both-rev to plus and minus, respectively
            switch (point.GetStrand()) {
            case eNa_strand_both:
                point.SetStrand(eNa_strand_plus);
                ChangeMade(CCleanupChange::eChangeStrand);
                break;
            case eNa_strand_both_rev:
                point.SetStrand(eNa_strand_minus);
                ChangeMade(CCleanupChange::eChangeStrand);
                break;
            default:
                break;
            }
        }
        break;

    default:
        break;
    }
}

// Replaces a "both" strand with plus and a "both-rev" strand with minus on
// a single interval; any other strand value is kept.
void CNewCleanup_imp::x_BothStrandBC( CSeq_interval & seq_interval )
{
    if (!seq_interval.CanGetStrand()) {
        return;
    }

    switch (seq_interval.GetStrand()) {
    case eNa_strand_both:
        seq_interval.SetStrand(eNa_strand_plus);
        ChangeMade(CCleanupChange::eChangeStrand);
        break;
    case eNa_strand_both_rev:
        seq_interval.SetStrand(eNa_strand_minus);
        ChangeMade(CCleanupChange::eChangeStrand);
        break;
    default:
        break;
    }
}

// Splits a Dbtag whose string tag contains ':' into several Dbtags.
//
// `dbt` keeps the first piece; one copy of `dbt` per remaining piece is
// appended to `out_new_dbtags`.  Nothing is done when
//  - the tag is unset, is not a string, or contains no ':';
//  - the db is MGD, MGI, HGNC or VGNC;
//  - m_IsEmblOrDdbj is set;
//  - the tag holds no text besides the ':' separators.
void CNewCleanup_imp::x_SplitDbtag( CDbtag &dbt, vector< CRef< CDbtag > > & out_new_dbtags )
{
    // check the common case of nothing to split
    if (!dbt.IsSetTag()) {
        return;
    }
    auto& tag = dbt.SetTag();
    if (!tag.IsStr()) {
        return;
    }
    if( tag.GetStr().find(':') == string::npos ) {
        return;
    }

    // check if we're trying to split something we shouldn't
    if (dbt.IsSetDb()) {
        const string& db = dbt.GetDb();
        if (NStr::Equal(db, "MGD") || NStr::Equal(db, "MGI") || NStr::Equal(db, "HGNC") || NStr::Equal(db, "VGNC")) {
            return;
        }
    }

    if ( m_IsEmblOrDdbj) {
        return;
    }

    // split by colon and generate new tags
    vector<string> tags;
    NStr::Split(tag.GetStr(), ":", tags, NStr::fSplit_Tokenize);
    if( tags.empty() ) {
        // A tag made up of nothing but separators produces no pieces;
        // front() and begin() + 1 below must not be used on an empty vector.
        return;
    }
    // debug-only sanity check kept from the original implementation
    _ASSERT( tags.size() >= 2 );

    // treat the CDbtag argument as the first of the new CDbtags
    tag.SetStr( tags.front() );
    for( vector<string>::const_iterator str_iter = tags.begin() + 1;
         str_iter != tags.end(); ++str_iter )
    {
        // copy everything from dbt, then override the tag text
        CRef<CDbtag> new_tag( new CDbtag );
        new_tag->Assign( dbt );
        new_tag->SetTag().SetStr( *str_iter );
        out_new_dbtags.push_back( new_tag );
    }

    ChangeMade(CCleanupChange::eCleanDbtag);
}

// Strict-weak ordering for codon values: true when codon1 sorts before
// codon2 (plain ascending integer order).
inline
static
bool s_CodonCompare( const int &codon1, const int &codon2 ) {
    return codon2 > codon1;
}

// Equality test for codon values, used when removing duplicate codons.
inline
static
bool s_CodonEqual( int codon1, int codon2 ) {
    return !(codon1 != codon2);
}

// Converts the amino acid stored in a tRNA extension to the requested
// coding.
//
// Returns the converted character, or '\0' if the amino acid uses none of
// the four supported encodings or the conversion produced nothing.
// If out_aa_char is given it receives the original, unconverted value
// ('\0' for an unsupported encoding).
static
char s_ConvertTrnaAaToLetter( const CTrna_ext::C_Aa &trna_aa, CSeqUtil::ECoding coding, char *out_aa_char = NULL )
{
    char original_aa = '\0';
    CSeqUtil::ECoding source_coding = CSeqUtil::e_Iupacaa;
    bool is_supported = true;

    // pick up the raw value together with the coding it is expressed in
    switch( trna_aa.Which() ) {
    case CTrna_ext::C_Aa::e_Iupacaa:
        original_aa = trna_aa.GetIupacaa();
        source_coding = CSeqUtil::e_Iupacaa;
        break;
    case CTrna_ext::C_Aa::e_Ncbieaa:
        original_aa = trna_aa.GetNcbieaa();
        source_coding = CSeqUtil::e_Ncbieaa;
        break;
    case CTrna_ext::C_Aa::e_Ncbi8aa:
        original_aa = trna_aa.GetNcbi8aa();
        source_coding = CSeqUtil::e_Ncbi8aa;
        break;
    case CTrna_ext::C_Aa::e_Ncbistdaa:
        original_aa = trna_aa.GetNcbistdaa();
        source_coding = CSeqUtil::e_Ncbistdaa;
        break;
    default:
        is_supported = false;
        break;
    }

    char converted_aa = '\0';
    size_t converted_count = 0;
    if( is_supported ) {
        converted_count = CSeqConvert::Convert(
            &original_aa, source_coding, 0, 1, &converted_aa, coding );
    }

    if( out_aa_char != NULL ) {
        *out_aa_char = original_aa;
    }
    return ( converted_count > 0 ) ? converted_aa : '\0';
}

// Basic cleanup of a tRNA extension: normalizes the amino-acid encoding and
// tidies the codon list.  `feat` is not used by this function's body.
void CNewCleanup_imp::x_SeqFeatTRNABC( CSeq_feat& feat, CTrna_ext & tRNA )
{
    // An Iupacaa-encoded amino acid is re-stored under the Ncbieaa choice
    // with the same numeric value.  The value is read first, while the
    // Iupacaa variant is still the selected one.
    if( tRNA.IsSetAa() && tRNA.GetAa().IsIupacaa() ) {
        const int old_value = tRNA.GetAa().GetIupacaa();
        tRNA.SetAa().SetNcbieaa( old_value );
        ChangeMade(CCleanupChange::eChange_tRna);
    }

    // Sort the codons with s_CodonCompare unless the IS_SORTED check
    // already passes.
    if (! CODON_ON_TRNAEXT_IS_SORTED(tRNA, s_CodonCompare)) {
        SORT_CODON_ON_TRNAEXT(tRNA, s_CodonCompare);
        ChangeMade(CCleanupChange::eChange_tRna);
    }

    // Drop duplicates (per s_CodonEqual) unless the IS_UNIQUE check already
    // passes; this runs after the sort above.
    if( ! CODON_ON_TRNAEXT_IS_UNIQUE(tRNA, s_CodonEqual) ) {
        UNIQUE_CODON_ON_TRNAEXT(tRNA, s_CodonEqual);
        ChangeMade(CCleanupChange::eChange_tRna);
    }

    // NOTE(review): no change is recorded for this step; presumably it just
    // unsets a codon list that is set but empty -- confirm against the macro.
    REMOVE_IF_EMPTY_CODON_ON_TRNAEXT(tRNA);
}

// Breaks one old-style PCR primer value into its comma-separated parts.
//
// out_list is always cleared first.  A NULL or empty component yields an
// empty list.  One pair of enclosing parentheses is stripped when the value
// starts with '(' , ends with ')' and has no further '(' inside.  Every
// resulting part has surrounding whitespace trimmed.
static
void s_ParsePCRComponent(vector<string> &out_list, const string *component)
{
    out_list.clear();

    if( component == NULL || component->empty() ) {
        return;
    }

    // Range of the value that actually gets split; shrunk by one character
    // on each side when the value is wrapped in a single pair of parens.
    string::size_type first = 0;
    string::size_type count = component->length();
    const bool is_parenthesized =
        count > 1 &&
        (*component)[0] == '(' &&
        (*component)[count - 1] == ')' &&
        component->find('(', 1) == string::npos;
    if( is_parenthesized ) {
        first = 1;
        count -= 2;
    }
    const string payload = component->substr( first, count );

    NStr::Split(payload, string(","), out_list, NStr::fSplit_Tokenize);
    for( string &part : out_list ) {
        NStr::TruncateSpacesInPlace( part );
    }
}

class CPCRParsedSet {
public:
    CPCRParsedSet( 
        const string * fwd_seq,
        const string * rev_seq,
        const string * fwd_name,
        const string * rev_name ) :
    m_Fwd_seq(      fwd_seq  == NULL ? kEmptyStr : *fwd_seq),
        m_Rev_seq(  rev_seq  == NULL ? kEmptyStr : *rev_seq ),
        m_Fwd_name( fwd_name == NULL ? kEmptyStr : *fwd_name ),
        m_Rev_name( rev_name == NULL ? kEmptyStr : *rev_name ),
        m_Original_order( ms_Next_original_order.Add(1) ) { }

    const string &GetFwdSeq() const { return m_Fwd_seq; }
    const string &GetRevSeq() const { return m_Rev_seq; }
    const string &GetFwdName() const { return m_Fwd_name; }
    const string &GetRevName() const { return m_Rev_name; }

    bool operator <( const CPCRParsedSet &rhs ) const {
        if ( int diff = NStr::CompareNocase( m_Fwd_seq, rhs.m_Fwd_seq ) )
            return diff < 0;
        if ( int diff = NStr::CompareNocase( m_Rev_seq, rhs.m_Rev_seq ) )
            return diff < 0;
        if ( int diff = NStr::CompareNocase( m_Fwd_name, rhs.m_Fwd_name ) )
            return diff < 0;
        if ( int diff = NStr::CompareNocase( m_Rev_name, rhs.m_Rev_name ) )
            return diff < 0;
        // last resort
        return m_Original_order < rhs.m_Original_order;
    }

private:
    string m_Fwd_seq;
    string m_Rev_seq;
    string m_Fwd_name;
    string m_Rev_name;
    CAtomicCounter::TValue m_Original_order;

    static CAtomicCounter ms_Next_original_order;
};

CAtomicCounter CPCRParsedSet::ms_Next_original_order;

static
void s_ParsePCRSet( const CBioSource &biosrc, list<CPCRParsedSet> &out_pcr_set )
{
    out_pcr_set.clear();

    const string* fwd_primer_seq = NULL;
    const string* rev_primer_seq = NULL;
    const string* fwd_primer_name = NULL;
    const string* rev_primer_name = NULL;

// convenience macro
#define PARSEPCRSET_CASE(Subtype) \
            case NCBI_SUBSOURCE(Subtype): \
            if( (*subsrc_iter)->IsSetName() ) { \
                Subtype = &((*subsrc_iter)->GetName()); \
            } \
            break;


    FOR_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
        SWITCH_ON_SUBSOURCE_CHOICE( **subsrc_iter ) {
        PARSEPCRSET_CASE(fwd_primer_seq)
        PARSEPCRSET_CASE(rev_primer_seq)
        PARSEPCRSET_CASE(fwd_primer_name)
        PARSEPCRSET_CASE(rev_primer_name)
        default:
            // ignore
            break;
        }
    }
#undef PARSEPCRSET_CASE

    // ParsePCRStrings 
    vector<string> fwd_seq_list;
    s_ParsePCRComponent(fwd_seq_list, fwd_primer_seq);
    vector<string> rev_seq_list;
    s_ParsePCRComponent(rev_seq_list, rev_primer_seq);
    vector<string> fwd_name_list;
    s_ParsePCRComponent(fwd_name_list, fwd_primer_name);
    vector<string> rev_name_list;
    s_ParsePCRComponent(rev_name_list, rev_primer_name);

    vector<string>::iterator curr_fwd_seq = fwd_seq_list.begin();
    vector<string>::iterator curr_rev_seq = rev_seq_list.begin();
    vector<string>::iterator curr_fwd_name = fwd_name_list.begin();
    vector<string>::iterator curr_rev_name = rev_name_list.begin();

    while (curr_fwd_seq != fwd_seq_list.end() || 
        curr_rev_seq != rev_seq_list.end()    || 
        curr_fwd_name != fwd_name_list.end()  || 
        curr_rev_name != rev_name_list.end() ) 
    {
        const string *fwd_seq = ( curr_fwd_seq != fwd_seq_list.end() ? &*curr_fwd_seq++ : NULL );
        const string *rev_seq = ( curr_rev_seq != rev_seq_list.end() ? &*curr_rev_seq++ : NULL );
        const string *fwd_name = ( curr_fwd_name != fwd_name_list.end() ? &*curr_fwd_name++ : NULL );
        const string *rev_name = ( curr_rev_name != rev_name_list.end() ? &*curr_rev_name++ : NULL );

        out_pcr_set.push_back( CPCRParsedSet(fwd_seq, rev_seq, fwd_name, rev_name) );
    }
}

// Splits `str` at ':' characters, trims surrounding whitespace from every
// piece and drops the pieces that end up empty.
// Note that out_list is not cleared here: NStr::Split is handed the vector
// as is, and the trimming loop then runs over everything it contains.
static
void s_ParsePCRColonString( vector<string> &out_list, const string &str ) 
{
    NStr::Split(str, ":", out_list, NStr::fSplit_Tokenize);
    EDIT_EACH_STRING_IN_VECTOR(str_iter, out_list ) {
        NStr::TruncateSpacesInPlace( *str_iter );
        // a piece consisting only of whitespace is removed altogether
        if( str_iter->empty() ) {
            ERASE_STRING_IN_VECTOR(str_iter, out_list);
        }
    }    
}

// Builds a CPCRPrimerSet from one colon-separated list of primer sequences
// and one colon-separated list of primer names.
//
//  - The i-th sequence becomes a primer that also gets the i-th name, if
//    there is one.
//  - If there are more names than sequences (and at least one sequence),
//    the surplus names are appended to the last primer's name, each
//    preceded by ':'.
//  - If there are no sequences at all, every name becomes a primer of its
//    own that has a name but no sequence.
//
// Returns a null reference when no primer was produced.
static 
CRef<CPCRPrimerSet> s_ModernizePCRPrimerHalf (const string &seq, const string &name)
{
    CRef<CPCRPrimerSet> primer_set( new CPCRPrimerSet );
    list< CRef< CPCRPrimer > > &primers = primer_set->Set();

    vector<string> seq_parts;
    s_ParsePCRColonString (seq_parts, seq);
    vector<string> name_parts;
    s_ParsePCRColonString (name_parts, name);

    const size_t num_seqs = seq_parts.size();
    const size_t num_names = name_parts.size();

    // one primer per sequence, paired with the name at the same position
    for ( size_t idx = 0; idx < num_seqs; ++idx ) {
        CRef<CPCRPrimer> primer( new CPCRPrimer );
        primer->SetSeq().Set( seq_parts[idx] );
        if ( idx < num_names ) {
            primer->SetName().Set( name_parts[idx] );
        }
        primers.push_back( primer );
    }

    if ( num_seqs > 0 ) {
        // attach any leftover names to the end of the name of the last seq
        CPCRPrimer &last_primer = *primers.back();
        for ( size_t idx = num_seqs; idx < num_names; ++idx ) {
            last_primer.SetName().Set() += ":" + name_parts[idx];
        }
    } else {
        // This differs from C.  C breaks as soon as it's looked at the
        // first name, but this version will create CPCRPrimer for all names.
        for ( size_t idx = 0; idx < num_names; ++idx ) {
            CRef<CPCRPrimer> primer( new CPCRPrimer );
            primer->SetName().Set( name_parts[idx] );
            primers.push_back( primer );
        }
    }

    // an empty set is reported as "nothing" rather than returned as is
    if ( primers.empty() ) {
        return CRef<CPCRPrimerSet>();
    }
    return primer_set;
}

// Predicate for remove_if over subsource references: true for a null
// reference and for any of the four old-style PCR primer subsources
// (fwd/rev primer seq, fwd/rev primer name).
class CIsBadCRefPCRSubSource {
public:
    bool operator()( const CRef<CSubSource> &subsource ) const {
        if( ! subsource ) {
            return true;
        }

        bool is_old_style_primer = false;
        SWITCH_ON_SUBSOURCE_CHOICE( *subsource ) {
        case NCBI_SUBSOURCE(fwd_primer_seq):
        case NCBI_SUBSOURCE(rev_primer_seq):
        case NCBI_SUBSOURCE(fwd_primer_name):
        case NCBI_SUBSOURCE(rev_primer_name):
            is_old_style_primer = true;
            break;
        default:
            break;
        }
        return is_old_style_primer;
    }
};

// Converts old-style PCR primer subsources (fwd_primer_seq etc.) into a
// structured PCR reaction set on the biosource.
//
// The reactions built from the subsources come first and any reactions the
// biosource already had are appended after them.  The combined set then
// goes through PCRReactionSetBC, and the old-style subsources are removed.
// Nothing is changed when the subsources yield no reaction.
void CNewCleanup_imp::x_ModernizePCRPrimers( CBioSource &biosrc )
{
    list<CPCRParsedSet> parsed_sets;
    s_ParsePCRSet( biosrc, parsed_sets );
    if( parsed_sets.empty() ) {
        return;
    }

    CRef<CPCRReactionSet> new_reaction_set( new CPCRReactionSet );
    list< CRef< CPCRReaction > > &reactions = new_reaction_set->Set();

    for( const CPCRParsedSet &parsed : parsed_sets ) {
        CRef<CPCRPrimerSet> forward =
            s_ModernizePCRPrimerHalf( parsed.GetFwdSeq(), parsed.GetFwdName() );
        CRef<CPCRPrimerSet> reverse =
            s_ModernizePCRPrimerHalf( parsed.GetRevSeq(), parsed.GetRevName() );

        // a reaction needs at least one of its two halves
        if ( ! forward && ! reverse ) {
            continue;
        }

        CRef<CPCRReaction> reaction( new CPCRReaction );
        if( forward ) {
            SET_FIELD( *reaction, Forward, *forward );
        }
        if( reverse ) {
            SET_FIELD( *reaction, Reverse, *reverse );
        }
        reactions.push_back( reaction );
    }

    // only add PCR reaction set if there's something in it
    if ( reactions.empty() ) {
        return;
    }

    // copy the existing reaction set (if any) to the end of ours
    list< CRef< CPCRReaction > > &existing_reactions =
        GET_MUTABLE(biosrc, Pcr_primers).Set();
    reactions.insert( reactions.end(),
        existing_reactions.begin(), existing_reactions.end() );

    // we are now the real pcr reaction set
    SET_FIELD( biosrc, Pcr_primers, *new_reaction_set );
    ChangeMade(CCleanupChange::eChangePCRPrimers);

    PCRReactionSetBC( GET_MUTABLE(biosrc, Pcr_primers) );

    // remove all old-style PCR primer subsources ( fwd_primer_seq, etc. )
    if( ! FIELD_IS_SET(biosrc, Subtype) ) {
        return;
    }
    list< CRef< CSubSource > > &subsources = GET_MUTABLE(biosrc, Subtype);
    list< CRef< CSubSource > >::iterator kept_end =
        remove_if( subsources.begin(), subsources.end(), CIsBadCRefPCRSubSource() );
    if( kept_end != subsources.end() ) {
        subsources.erase( kept_end, subsources.end() );
        ChangeMade(CCleanupChange::eChangeSubsource);
    }

    REMOVE_IF_EMPTY_SUBSOURCE_ON_BIOSOURCE(biosrc);
}

// Splits `str` into pieces at "single" tildes and appends the pieces to
// piece_vec.
//
// A run of tildes is a split point only if the character before its first
// tilde is not a space and that first tilde is not directly followed by
// another tilde; runs that do not qualify stay inside the current piece.
// Every piece is trimmed, and pieces that are empty after trimming are
// not added.
static
void s_SplitAtSingleTildes( list<string> &piece_vec, const string &str )
{
    if( str.empty() ) {
        return;
    }

    const string::size_type len = str.length();

    // appends `raw` in trimmed form, unless trimming leaves nothing
    auto append_trimmed = [&piece_vec]( const string &raw ) {
        piece_vec.push_back( raw );
        NStr::TruncateSpacesInPlace( piece_vec.back() );
        if( piece_vec.back().empty() ) {
            piece_vec.pop_back();
        }
    };

    // piece_begin is where the piece being assembled starts; scan is where
    // the next tilde search starts ( invariant: scan >= piece_begin )
    string::size_type piece_begin = 0;
    string::size_type scan = 0;
    while( scan < len ) {
        // the end of the string acts as a final, virtual tilde
        string::size_type tilde = str.find('~', scan);
        if( tilde == string::npos ) {
            tilde = len;
        }

        const bool preceded_by_space = ( tilde > 0 && str[tilde - 1] == ' ' );
        const bool followed_by_tilde = ( tilde < (len - 1) && str[tilde + 1] == '~' );
        const bool can_split_here = ! preceded_by_space && ! followed_by_tilde;

        if( can_split_here ) {
            append_trimmed( str.substr(piece_begin, tilde - piece_begin) );
        }

        // step over the whole run of tildes, usable or not
        scan = tilde;
        while( scan < len && str[scan] == '~' ) {
            ++scan;
        }

        if( can_split_here ) {
            // the next piece starts right after the run
            piece_begin = scan;
        }
    }

    // whatever is left after the last split point
    append_trimmed( str.substr(piece_begin) );
}

// Lookup tables used by the orgmod/subsource "other" cleanup below: for each
// qualifier subtype, the set of values already present under that subtype.
typedef map< TORGMOD_SUBTYPE, set<string> >    TExistingOrgModMap;
typedef map< TSUBSOURCE_SUBTYPE, set<string> > TExistingSubsourceMap;

// Cleans the text of an "other" orgmod/subsource note in place.
//
// The text is split at single tildes.  A piece that starts with an orgmod
// or subsource prefix and whose remaining value is already recorded for
// that subtype in the corresponding map is dropped.  The surviving pieces
// are joined with "~" again.
//
// returns true if subname was changed
static
bool s_CleanupOrgModAndSubSourceOther_helper(
    string &subname, 
    const TExistingOrgModMap &existingOrgModMap, 
    const TExistingSubsourceMap &existingSubsourceMap )
{
    list<string> pieces;
    s_SplitAtSingleTildes( pieces, subname );

    if( pieces.empty() ) {
        // nothing usable in there: the text becomes empty, which is a
        // change only if it was not empty to begin with
        const bool had_text = ! subname.empty();
        subname.clear();
        return had_text;
    }

    // check if any pieces are duplicated elsewhere
    for( list<string>::iterator piece_iter = pieces.begin();
         piece_iter != pieces.end(); )
    {
        string &piece = (*piece_iter);

        string::size_type val_start_pos = 0;
        TORGMOD_SUBTYPE orgmod_subtype = NCBI_ORGMOD(other);
        TSUBSOURCE_SUBTYPE subsrc_subtype = NCBI_SUBSOURCE(other);

        bool is_duplicate = false;
        if( s_StringHasOrgModPrefix(piece, val_start_pos, orgmod_subtype) ) {
            const string val = piece.substr(val_start_pos);
            const TExistingOrgModMap::const_iterator known =
                existingOrgModMap.find(orgmod_subtype);
            is_duplicate = ( known != existingOrgModMap.end() &&
                             known->second.count(val) > 0 );
        } else if( s_StringHasSubSourcePrefix(piece, val_start_pos, subsrc_subtype) ) {
            const string val = piece.substr(val_start_pos);
            const TExistingSubsourceMap::const_iterator known =
                existingSubsourceMap.find(subsrc_subtype);
            is_duplicate = ( known != existingSubsourceMap.end() &&
                             known->second.count(val) > 0 );
        }

        if( is_duplicate ) {
            // already exists, so should be removed
            piece_iter = pieces.erase(piece_iter);
        } else {
            ++piece_iter;
        }
    }

    string rebuilt = NStr::Join( pieces, "~" );
    if( rebuilt == subname ) {
        return false;
    }
    // swap is faster than assignment
    subname.swap( rebuilt );
    return true;
}

// Removes from the "other" orgmods of `orgname` and the "other" subsources
// of `biosrc` any piece of text that repeats a value already stored in a
// dedicated (non-"other") orgmod or subsource.  An "other" entry whose text
// ends up empty is erased altogether.
void CNewCleanup_imp::x_CleanupOrgModAndSubSourceOther( COrgName &orgname, CBioSource &biosrc )
{
    // Load each orgmod and subsource into a map for later retrieval
    // ( More efficient than C's quadratic loop-in-a-loop for bigger cases )

    // subtype -> values, for every orgmod that is not of type "other"
    TExistingOrgModMap existingOrgModMap;
    FOR_EACH_ORGMOD_ON_ORGNAME( orgmod_iter, orgname ) {
        const COrgMod &org_mod = **orgmod_iter;
        if( FIELD_IS_SET(org_mod, Subtype) && 
            GET_FIELD(org_mod, Subtype) != NCBI_ORGMOD(other) )
        {
            const string &val = GET_STRING_FLD_OR_BLANK(org_mod, Subname);
            existingOrgModMap[GET_FIELD(org_mod, Subtype)].insert( val );
        }
    }

    // subtype -> values, for every subsource that is not of type "other"
    // (the loop only reads, even though it uses the EDIT_EACH macro)
    TExistingSubsourceMap existingSubsourceMap;
    EDIT_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
        const CSubSource &subsrc = **subsrc_iter;
        if( FIELD_IS_SET(subsrc, Subtype) && 
            GET_FIELD(subsrc, Subtype) != NCBI_SUBSOURCE(other) )
        {
            const string &val = GET_STRING_FLD_OR_BLANK(subsrc, Name);
            existingSubsourceMap[GET_FIELD(subsrc, Subtype)].insert( val );
        }
    }

    // edit orgmods of type "other"

    EDIT_EACH_ORGMOD_ON_ORGNAME( orgmod_iter, orgname ) {
        COrgMod &org_mod = **orgmod_iter;

        // we're only cleaning the ones of type "other"
        if( ! FIELD_EQUALS(org_mod, Subtype, NCBI_ORGMOD(other) ) ||
            ! FIELD_IS_SET(org_mod, Subname) )
        {
            continue;
        }

        // the helper rewrites the subname in place and reports whether it
        // changed anything
        string &subname = GET_MUTABLE( org_mod, Subname );
        if( s_CleanupOrgModAndSubSourceOther_helper( subname, existingOrgModMap, existingSubsourceMap ) ) {
            ChangeMade(CCleanupChange::eChangeOrgmod);
        }

        // nothing left in the note: drop the orgmod itself
        if( subname.empty() ) {
            ERASE_ORGMOD_ON_ORGNAME(orgmod_iter, orgname);
            ChangeMade(CCleanupChange::eRemoveOrgmod);
        }
    }

    // edit subsources of type "other"

    EDIT_EACH_SUBSOURCE_ON_BIOSOURCE( subsrc_iter, biosrc ) {
        CSubSource &subsrc = **subsrc_iter;

        // we're only cleaning the ones of type "other"
        if( ! FIELD_EQUALS(subsrc, Subtype, NCBI_SUBSOURCE(other) ) ||
            ! FIELD_IS_SET(subsrc, Name) ) 
        {
            continue;
        }

        // same treatment as for the orgmods above
        string &name = GET_MUTABLE( subsrc, Name );
        if( s_CleanupOrgModAndSubSourceOther_helper( name, existingOrgModMap, existingSubsourceMap ) ) {
            ChangeMade(CCleanupChange::eChangeSubsource);
        }

        // nothing left in the note: drop the subsource itself
        if( name.empty() ) {
            ERASE_SUBSOURCE_ON_BIOSOURCE(subsrc_iter, biosrc);
            ChangeMade(CCleanupChange::eRemoveSubSource);
        }
    }
}


// As requested in SQD-4021:
// * if strain begins with "serovar " move remaining text to a serovar 
// qualifier, unless a serovar qualifier is already present, in which case add
// to note
// * if strain begins with "subsp. " move remaining text to a subspecies
// qualifier, unless a subspecies qualifier is already present, in which case
// add to note
static const string kSubsp = "subsp. ";
static const string kSerovar = "serovar ";
struct SRemoveNamedStrain {
    bool operator()(CRef<COrgMod> m) {
        bool rval = false;
        if (m->IsSetSubtype() && m->IsSetSubname()) {
            auto subtype = m->GetSubtype();
            auto subname = m->GetSubname();
            if (subtype == COrgMod::eSubtype_serovar) {
                if (NStr::StartsWith(subname, kSubsp)) {
                    rval = true;
                }
            } else if (subtype == COrgMod::eSubtype_strain) {
                if (NStr::StartsWith(subname, kSubsp) || NStr::StartsWith(subname, kSerovar)) {
                    rval = true;
                }
            }

        }
        return rval;
    }
};

// Scans the modifiers of orgname for strain/serovar values that carry a
// "subsp. " or "serovar " prefix, hands the text after the prefix to the
// three-argument overload (which records it under the proper subtype),
// and finally drops the prefixed modifiers themselves.
void CNewCleanup_imp::x_MovedNamedValuesInStrain(COrgName& orgname)
{
    if (!orgname.IsSetMod()) {
        return;
    }
    auto& mods = orgname.SetMod();
    for (auto m : mods) {
        if (!m->IsSetSubtype() || !m->IsSetSubname()) {
            continue;
        }
        const auto subtype = m->GetSubtype();
        const bool is_serovar = (subtype == COrgMod::eSubtype_serovar);
        const bool is_strain = (subtype == COrgMod::eSubtype_strain);
        if (!is_serovar && !is_strain) {
            continue;
        }

        if (NStr::StartsWith(m->GetSubname(), kSubsp)) {
            // both serovar and strain may carry a subspecies prefix
            const string val = m->GetSubname().substr(kSubsp.length());
            x_MovedNamedValuesInStrain(orgname, COrgMod::eSubtype_sub_species, val);
        } else if (is_strain && NStr::StartsWith(m->GetSubname(), kSerovar)) {
            // only a strain is checked for a serovar prefix
            const string val = m->GetSubname().substr(kSerovar.length());
            x_MovedNamedValuesInStrain(orgname, COrgMod::eSubtype_serovar, val);
        }
    }

    // the prefixed originals have been processed above; remove them
    mods.erase(std::remove_if(mods.begin(), mods.end(), SRemoveNamedStrain()),
               mods.end());
}


// Records `value` (text extracted from a strain/serovar name) on orgname.
//
//  * If a modifier of subtype `stype` with exactly this subname already
//    exists, nothing is added.
//  * Otherwise, if some modifier of subtype `stype` is already present
//    (with a different or unset subname), the value is added as a note,
//    i.e. a modifier of subtype "other".
//  * Otherwise the value is added as a new modifier of subtype `stype`.
//
// ChangeMade(eAddOrgMod) is called whenever a modifier is appended.
void CNewCleanup_imp::x_MovedNamedValuesInStrain(COrgName& orgname, COrgMod::ESubtype stype, const string& value)
{
    bool already_present = false;   // same subtype AND same value
    bool subtype_in_use = false;    // same subtype, different value
    ITERATE(COrgName::TMod, m, orgname.GetMod()) {
        if (!(*m)->IsSetSubtype() || (*m)->GetSubtype() != stype) {
            continue;
        }
        if ((*m)->IsSetSubname() && NStr::Equal((*m)->GetSubname(), value)) {
            // already there, nothing needs to be added
            already_present = true;
            break;
        }
        subtype_in_use = true;
    }
    if (already_present) {
        return;
    }

    // The note case must win over the plain-add case: previously the
    // plain add was tested first, which made the note branch unreachable.
    const COrgMod::ESubtype target_subtype =
        subtype_in_use ? COrgMod::eSubtype_other : stype;
    orgname.SetMod().push_back(CRef<COrgMod>(new COrgMod(target_subtype, value)));
    ChangeMade(CCleanupChange::eAddOrgMod);
}


// Basic cleanup of the OrgMod list on an OrgName.
//
// Steps, in order:
//  1. trim/compress Subname and Attrib of every modifier and erase
//     modifiers that are empty or subsumed by a neighbour;
//  2. move "subsp. "/"serovar " prefixed values via
//     x_MovedNamedValuesInStrain;
//  3. erase an "other" modifier of the form "anamorph:<value>" when the
//     same value is already held by an anamorph (or, failing that, a
//     gb_anamorph) modifier.
//
// org_ref_common is compared (case-insensitively) with "common" modifiers;
// a match currently only exempts the modifier from the checks in step 1.
void
CNewCleanup_imp::x_OrgnameModBC( COrgName &orgname, const string &org_ref_common )
{
    if( ! FIELD_IS_SET(orgname, Mod) ) {
        return;
    }

    // last modifier that was kept (not erased) by the loop below
    COrgMod *prev = NULL;

    EDIT_EACH_ORGMOD_ON_ORGNAME( orgmod_iter, orgname ) {
        COrgMod &orgmod = **orgmod_iter;

        // set when the current modifier should be erased
        bool unlink = false;

        CLEAN_AND_COMPRESS_STRING_MEMBER(orgmod, Subname);
        CLEAN_AND_COMPRESS_STRING_MEMBER(orgmod, Attrib);

        const TORGMOD_SUBTYPE subtype = GET_FIELD(orgmod, Subtype);
        const string &subname = GET_FIELD(orgmod, Subname);

        if ( (subtype == NCBI_ORGMOD(common)) && 
            NStr::EqualNocase(subname, org_ref_common) )
        {
            // if you find this code commented out for a long, long time, you can probably
            // just remove it.  (originally commented-out under JIRA SQD-816)
            //// unlink = true;
        } else if( prev != NULL ) {
            const TORGMOD_SUBTYPE prev_subtype = GET_FIELD(*prev, Subtype);
            const string &prev_subname = GET_FIELD(*prev, Subname);

            if( subname.empty() ) {
                unlink = true;
            } else if (prev_subtype == subtype &&
                prev_subtype == NCBI_ORGMOD(other) &&
                NStr::Find (subname, prev_subname) != NPOS ) 
            {
                // two consecutive "other" modifiers where the current one
                // contains the previous one's text: keep the longer text in
                // the previous slot and drop the current modifier
                prev->Assign( orgmod );
                unlink = true;
            }
        } else if ( subname.empty() ||
            subname == ")"  ||
            subname == "(" )
        {
            // only reached while no modifier has been kept yet
            unlink = true;
        }

        if (unlink) {
            ERASE_ORGMOD_ON_ORGNAME(orgmod_iter, orgname);
            ChangeMade(CCleanupChange::eRemoveOrgmod);
        } else {
            prev = &**orgmod_iter;
        }
    }

    x_MovedNamedValuesInStrain(orgname);

    // Remember one modifier of each relevant subtype; if several exist,
    // the last one seen wins.
    COrgMod *omp_anamorph = NULL;
    COrgMod *omp_gb_anamorph = NULL;
    COrgMod *omp_other = NULL;

    EDIT_EACH_ORGMOD_ON_ORGNAME( orgmod_iter, orgname ) {
        const TORGMOD_SUBTYPE subtype = GET_FIELD(**orgmod_iter, Subtype);
        switch( subtype ) {
        case NCBI_ORGMOD(anamorph):
            omp_anamorph = &**orgmod_iter;
            break;
        case NCBI_ORGMOD(gb_anamorph):
            omp_gb_anamorph = &**orgmod_iter;
            break;
        case NCBI_ORGMOD(other):
            omp_other = &**orgmod_iter;
            break;
        }
    }

    // true when omp_other merely repeats an existing anamorph value
    bool redund = false;

    static const string kAnamorph = "anamorph:";
    if ( (omp_other != NULL) && NStr::StartsWith(GET_FIELD(*omp_other, Subname), kAnamorph, NStr::eNocase) ) {

        // This part is just to set anamorph_value to the part of the subname
        // after "anamorph:" and spaces.
        const SIZE_TYPE after_anamorph_pos = kAnamorph.length();
        SIZE_TYPE after_anamorph_pos_and_spaces = 
            GET_FIELD(*omp_other, Subname).find_first_not_of(" ", after_anamorph_pos);
        if( after_anamorph_pos_and_spaces == NPOS ) {
            // nothing but spaces (or nothing at all) follows the prefix
            after_anamorph_pos_and_spaces = after_anamorph_pos;
        }
        const string anamorph_value = GET_FIELD(*omp_other, Subname).substr(after_anamorph_pos_and_spaces);

        // gb_anamorph is consulted only when there is no anamorph modifier
        if (omp_anamorph != NULL) {
            if ( GET_FIELD(*omp_anamorph, Subname) == anamorph_value ) {
                redund = true;
            }
        } else if (omp_gb_anamorph != NULL) {
            if ( GET_FIELD(*omp_gb_anamorph, Subname) == anamorph_value ) {
                redund = true;
            }
        }
    }
    if(redund) {
        // remove omp_other
        EDIT_EACH_ORGMOD_ON_ORGNAME( orgmod_iter, orgname ) {
            if( &**orgmod_iter == omp_other ) {
                ERASE_ORGMOD_ON_ORGNAME(orgmod_iter, orgname);
                ChangeMade(CCleanupChange::eRemoveOrgmod);
                break;
            }
        }
    }
}

// Runs CSubSource::AutoFix on a subsource that has both a subtype and a
// name, and reports eChangeSubsource when the name text differs afterwards.
void CNewCleanup_imp::x_SubSourceBC(CSubSource & subsrc)
{
    const bool has_required_fields = subsrc.IsSetSubtype() && subsrc.IsSetName();
    if (has_required_fields) {
        // snapshot taken so the effect of AutoFix can be detected
        const string name_before = subsrc.GetName();
        subsrc.AutoFix();
        const bool unchanged = NStr::Equal(subsrc.GetName(), name_before);
        if (!unchanged) {
            ChangeMade(CCleanupChange::eChangeSubsource);
        }
    }
}

// Runs COrgMod::AutoFix on a modifier that has both a subtype and a
// subname, and reports eChangeOrgmod when the subname differs afterwards.
void CNewCleanup_imp::x_OrgModBC(COrgMod & orgmod)
{
    const bool has_required_fields = orgmod.IsSetSubtype() && orgmod.IsSetSubname();
    if (has_required_fields) {
        // snapshot taken so the effect of AutoFix can be detected
        const string subname_before = orgmod.GetSubname();
        orgmod.AutoFix();
        const bool unchanged = NStr::Equal(orgmod.GetSubname(), subname_before);
        if (!unchanged) {
            ChangeMade(CCleanupChange::eChangeOrgmod);
        }
    }
}

// Reconciles MolInfo.biomol with Seq-inst.mol.
//
//  * biomol "unknown" is simply reset (eChangeMolInfo) and nothing else
//    happens.
//  * If the bioseq has an Inst whose mol is unset / not_set, mol is derived
//    from biomol (genomic and genomic_mRNA -> na, the RNA biomols -> rna,
//    peptide -> aa, other_genetic -> other).
//  * If mol is set to something other than rna while biomol is cRNA or
//    mRNA, mol is forced to rna.
// Every mol assignment is reported as eChangeBiomol.
void CNewCleanup_imp::x_FixUnsetMolFromBiomol( CMolInfo& molinfo, CBioseq &bioseq )
{
    if( ! FIELD_IS_SET(molinfo, Biomol) ) {
        return;
    }

    const TMOLINFO_BIOMOL biomol = GET_FIELD(molinfo, Biomol);
    if( biomol == NCBI_BIOMOL(unknown) ) {
        RESET_FIELD( molinfo, Biomol );
        ChangeMade(CCleanupChange::eChangeMolInfo);
        return;
    }

    if( ! FIELD_IS_SET(bioseq, Inst) ) {
        return;
    }

    // an unset mol is treated exactly like an explicit not_set
    TSEQ_MOL current_mol = NCBI_SEQMOL(not_set);
    if( FIELD_IS_SET(bioseq.GetInst(), Mol) ) {
        current_mol = GET_FIELD(bioseq.GetInst(), Mol);
    }

    bool assign_mol = false;
    TSEQ_MOL replacement_mol = current_mol;

    if( current_mol == NCBI_SEQMOL(not_set) ) {
        switch( biomol ) {
        case NCBI_BIOMOL(genomic):
        case NCBI_BIOMOL(genomic_mRNA):
            replacement_mol = NCBI_SEQMOL(na);
            assign_mol = true;
            break;
        case NCBI_BIOMOL(pre_RNA):
        case NCBI_BIOMOL(mRNA):
        case NCBI_BIOMOL(rRNA):
        case NCBI_BIOMOL(tRNA):
        case NCBI_BIOMOL(snRNA):
        case NCBI_BIOMOL(scRNA):
        case NCBI_BIOMOL(cRNA):
        case NCBI_BIOMOL(snoRNA):
        case NCBI_BIOMOL(transcribed_RNA):
        case NCBI_BIOMOL(ncRNA):
        case NCBI_BIOMOL(tmRNA):
            replacement_mol = NCBI_SEQMOL(rna);
            assign_mol = true;
            break;
        case NCBI_BIOMOL(peptide):
            replacement_mol = NCBI_SEQMOL(aa);
            assign_mol = true;
            break;
        case NCBI_BIOMOL(other_genetic):
            replacement_mol = NCBI_SEQMOL(other);
            assign_mol = true;
            break;
        default:
            break;
        }
    } else if( current_mol != NCBI_SEQMOL(rna) ) {
        const bool biomol_demands_rna =
            ( biomol == NCBI_BIOMOL(cRNA) || biomol == NCBI_BIOMOL(mRNA) );
        if( biomol_demands_rna ) {
            replacement_mol = NCBI_SEQMOL(rna);
            assign_mol = true;
        }
    }

    if( assign_mol ) {
        SET_FIELD( bioseq.SetInst(), Mol, replacement_mol );
        ChangeMade(CCleanupChange::eChangeBiomol);
    }
}


// Delegates to CCleanup::AddPartialToProteinTitle and records
// eCleanBioseqTitle when that call reports a change.
void CNewCleanup_imp::x_AddPartialToProteinTitle( CBioseq &bioseq )
{
    const bool title_changed = CCleanup::AddPartialToProteinTitle(bioseq);
    if (title_changed) {
        ChangeMade(CCleanupChange::eCleanBioseqTitle);
    }
}

// returns empty string if there's a problem
// Tries to peel a satellite type ("microsatellite", "minisatellite" or
// "satellite") off the front of comment.
//
//  * comment is exactly the type: comment is cleared and the type returned.
//  * the type is followed by ';': the type and the ';' are removed from
//    comment (which is then trimmed) and the type is returned.
//  * anything else: an empty string is returned.
// A single leading '~' (not followed by another '~') left in comment is
// replaced by a space and trimmed away.
string CNewCleanup_imp::x_ExtractSatelliteFromComment( string &comment )
{
    if( comment.empty() ) {
        return kEmptyStr;
    }

    // order matters only for readability here: neither of the longer
    // names starts with "satellite"
    static const char* const kSatelliteTypes[] = {
        "microsatellite",
        "minisatellite",
        "satellite"
    };

    string satellite_type;
    for( const char* candidate : kSatelliteTypes ) {
        if( NStr::StartsWith(comment, candidate) ) {
            satellite_type = candidate;
            break;
        }
    }
    if( satellite_type.empty() ) {
        return kEmptyStr;
    }

    const string::size_type type_len = satellite_type.length();

    if( comment.length() == type_len ) {
        // the whole comment is just the satellite type
        comment.clear();
        ChangeMade(CCleanupChange::eRemoveComment);
        return satellite_type;
    }

    string satellite_qual; // the answer
    if( comment[type_len] == ';' ) {
        satellite_qual = satellite_type;
        comment = comment.substr( type_len + 1 );
        NStr::TruncateSpacesInPlace(comment);
        ChangeMade(CCleanupChange::eChangeComment);
    }

    const bool lone_leading_tilde = ( comment [0] == '~' && comment [1] != '~' );
    if( lone_leading_tilde ) {
        comment [0] = ' ';
        NStr::TruncateSpacesInPlace(comment);
        ChangeMade(CCleanupChange::eChangeComment);
    }

    return satellite_qual;
}


// Normalizes a single EC number string in place:
//  * surrounding whitespace is trimmed,
//  * a leading "EC " or "EC:" (any case) is removed,
//  * trailing punctuation/whitespace is stripped (see below).
// eCleanECNumber is reported when the length of the string changed.
void CNewCleanup_imp::x_CleanupECNumber( string &ec_num )
{
    const string::size_type original_ec_num_length = ec_num.length();
    NStr::TruncateSpacesInPlace( ec_num );

    // remove any unnecessary "EC " prefix
    s_RemoveInitial( ec_num, "EC ", NStr::eNocase );
    s_RemoveInitial( ec_num, "EC:", NStr::eNocase );

    // Remove trailing punctuation and whitespace, one character at a time:
    //  * a trailing '-' is kept when it is preceded by a period
    //    (e.g. "1.2.-"); stripping stops there
    //  * every other trailing punctuation or whitespace character is removed
    //  * the first character of the string is never removed
    // NOTE(review): an earlier comment here also promised to keep a trailing
    // period preceded by a digit, 'n' or '-'; the code has never done that
    // and this rewrite keeps the existing behavior - confirm which is wanted.
    while( ec_num.length() >= 2 ) {
        // <cctype> classifiers are undefined for negative char values, so
        // the character is inspected as unsigned char
        const unsigned char last =
            static_cast<unsigned char>( ec_num[ec_num.length() - 1] );
        if( ! ispunct(last) && ! isspace(last) ) {
            break;
        }
        if( last == '-' && ec_num[ec_num.length() - 2] == '.' ) {
            break;
        }
        ec_num.resize( ec_num.length() - 1 );
    }

    if( ec_num.length() != original_ec_num_length ) {
        ChangeMade(CCleanupChange::eCleanECNumber);
    }
}

// True when ec_num is non-empty and consists solely of characters that may
// appear in a list of EC numbers: digits, '.', '-', 'n', ' ' and ';'.
static bool s_ECNumberCanBeSplit( const string & ec_num )
{
    static const char* const kAllowedChars = "0123456789.-n ;";
    return ! ec_num.empty()  &&
           ec_num.find_first_not_of(kAllowedChars) == string::npos;
}

// Cleans every EC number in the list and splits entries that hold several
// numbers separated by ' ' or ';'.  A split keeps the text before the
// first separator in the current entry and inserts the remainder as a new
// entry right after it, so the remainder is cleaned (and possibly split
// again) on the next iteration.
void CNewCleanup_imp::x_CleanupECNumberList( CProt_ref::TEc & ec_num_list )
{
    // CProt_ref::TEc is a list, so the iterator stays valid even if we 
    // add new entries after the current one
    NON_CONST_ITERATE( CProt_ref::TEc, ec_num_iter, ec_num_list ) {
        string & current = *ec_num_iter;
        x_CleanupECNumber( current );

        if( ! s_ECNumberCanBeSplit(current) ) {
            continue;
        }

        // if there are any, split at first ' ' or ';'
        const string::size_type separator_pos = current.find_first_of(" ;");
        if( separator_pos == string::npos ) {
            continue;
        }

        const string remainder = current.substr( separator_pos + 1 );
        current.resize( separator_pos );

        CProt_ref::TEc::iterator insert_before = ec_num_iter;
        ++insert_before;
        ec_num_list.insert( insert_before, remainder );
        ChangeMade(CCleanupChange::eCleanECNumber);
    }
}

// Delegates to CCleanup::UpdateECNumbers and records eCleanECNumber when
// that call reports a change.
void CNewCleanup_imp::x_CleanupECNumberListEC( CProt_ref::TEc & ec_num_list )
{
    const bool list_changed = CCleanup::UpdateECNumbers(ec_num_list);
    if (list_changed) {
        ChangeMade(CCleanupChange::eCleanECNumber);
    }
}

// Replaces a non-empty inference string with the result of
// CGb_qual::CleanupAndRepairInference and records eCleanQualifiers when
// the text actually changed.
void CNewCleanup_imp::x_CleanupAndRepairInference( string &inference )
{
    if( inference.empty() ) {
        return;
    }

    const string before = inference;
    const string repaired = CGb_qual::CleanupAndRepairInference( before );
    if( repaired == before ) {
        return;
    }

    inference = repaired;
    ChangeMade(CCleanupChange::eCleanQualifiers);
}

// Brings a satellite qualifier value into the "<type>:<name>" shape.
//
//  * Values that do not start with "satellite", "microsatellite" or
//    "minisatellite" get "satellite:" prepended (after trimming leading
//    spaces).
//  * Otherwise a space directly after the type becomes ':' and spaces
//    following the first colon are removed.
void CNewCleanup_imp::x_MendSatelliteQualifier( string &val )
{
    if ( val.empty() ){
        return;
    }

    CCachedRegexp prefixRegexp = regexpCache.Get("^(micro|mini|)satellite");
    if( ! prefixRegexp->IsMatch(val) ) {
        // no recognised type at the front: assume a plain satellite
        NStr::TruncateSpacesInPlace( val, NStr::eTrunc_Begin );
        val = "satellite:" + val;
        ChangeMade(CCleanupChange::eChangeQualifiers);
        return;
    }

    // index of the first character after the matched type
    const SIZE_TYPE type_end = prefixRegexp->GetResults(0)[1];
    if( type_end < val.length() && val[type_end] == ' ' ) {
        val[type_end] = ':';
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }

    // remove spaces after first colon
    const size_t colon_pos = NStr::Find(val, ":");
    if( colon_pos == string::npos ) {
        return;
    }
    if( isspace(val.c_str()[colon_pos + 1]) ) {
        if( s_RegexpReplace(val, ":[ ]+", ":", 1) ) {
            ChangeMade(CCleanupChange::eChangeQualifiers);
        }
    }
}

// Rewrites a trailing run of three characters drawn from ',' and '.' as
// "..." and records eChangeComment when the replacement helper reports
// a change.
void CNewCleanup_imp::x_FixUpEllipsis( string &str )
{
    const bool replaced = s_RegexpReplace( str, "[,.][,.][,.]$", "..." );
    if( replaced ) {
        ChangeMade(CCleanupChange::eChangeComment);
    }
}

// Strips matching pairs of quote characters (' or ") from both ends of
// val, repeatedly, e.g.  "'abc'"  ->  abc.  A string that consists only
// of nested quotes (including a single lone quote character) is cleared.
// eTrimFlankingQuotes is recorded whenever at least one character was
// removed.
void CNewCleanup_imp::x_RemoveFlankingQuotes( string &val )
{
    if (val.empty()) {
        return;
    }

    // [first_pos, end_pos) is the part of val we will keep.  A half-open
    // range is used so that both bounds stay valid unsigned values even
    // when the whole string is consumed; decrementing an unsigned "last
    // kept index" below zero would wrap around and index out of bounds.
    string::size_type first_pos = 0;
    string::size_type end_pos = val.length();

    // move inwards until there are no more quotes to trim
    while( first_pos < end_pos ) {
        const char ch1 = val[first_pos];
        const char ch2 = val[end_pos - 1];
        if( (ch1 != '\'' && ch1 != '\"') || ch1 != ch2 ) {
            break;
        }
        ++first_pos;
        --end_pos;
    }

    // check if there was no change (this is the case almost always)
    if( 0 == first_pos ) {
        return;
    }

    // check if string is all nested quotes
    if( first_pos >= end_pos ) {
        // Just clear it
        val.clear();
    } else {
        val = val.substr( first_pos, end_pos - first_pos );
    }

    ChangeMade(CCleanupChange::eTrimFlankingQuotes);
}

static
bool s_IsIllegalQual( const string &qual )
{
    static const char * const sc_Illegal_qual_array[] = {
        "anticodon",
        "citation",
        "codon_start",
        "db_xref",
        "evidence",
        "exception",
        "gene",
        "note",
        "protein_id",
        "pseudo",
        "transcript_id",
        "transl_except",
        "transl_table",
        "translation"
    };
    typedef CStaticArraySet<const char*, PNocase_CStr> TIllegalQualSet;
    DEFINE_STATIC_ARRAY_MAP( TIllegalQualSet, sc_IllegalQualArray, sc_Illegal_qual_array );

    return ( sc_IllegalQualArray.find(qual.c_str()) != sc_IllegalQualArray.end() );
}

static bool s_GbQualCompare (
    const CRef<CGb_qual>& gb1,
    const CRef<CGb_qual>& gb2
)

{
    const CGb_qual& gbq1 = *(gb1);
    const CGb_qual& gbq2 = *(gb2);

    const string& ql1 = GET_FIELD (gbq1, Qual);
    const string& ql2 = GET_FIELD (gbq2, Qual);

    int comp = s_CompareNoCaseCStyle(ql1, ql2);
    if (comp < 0) return true;
    if (comp > 0) return false;

    const string& vl1 = GET_FIELD (gbq1, Val);
    const string& vl2 = GET_FIELD (gbq2, Val);

    if (NStr::CompareNocase (vl1, vl2) < 0) return true;

    return false;
}

static bool s_GbQualCompareLegalFirst (
    const CRef<CGb_qual>& gb1,
    const CRef<CGb_qual>& gb2
)

{
    const CGb_qual& gbq1 = *(gb1);
    const CGb_qual& gbq2 = *(gb2);

    const string& ql1 = GET_FIELD (gbq1, Qual);
    const string& ql2 = GET_FIELD (gbq2, Qual);

    // legal quals first
    const bool is_illegal1 = s_IsIllegalQual(ql1);
    const bool is_illegal2 = s_IsIllegalQual(ql2);
    if( is_illegal1 && ! is_illegal2 ) {
        return false;
    } else if( ! is_illegal1 && is_illegal2 ) {
        return true;
    }

    return s_GbQualCompare(gb1, gb2);
}

static bool s_GbQualEqual (
    const CRef<CGb_qual>& gb1,
    const CRef<CGb_qual>& gb2
)

{
    const CGb_qual& gbq1 = *(gb1);
    const CGb_qual& gbq2 = *(gb2);

    const string& ql1 = GET_FIELD (gbq1, Qual);
    const string& ql2 = GET_FIELD (gbq2, Qual);

    if (! NStr::EqualNocase (ql1, ql2)) return false;

    const string& vl1 = GET_FIELD (gbq1, Val);
    const string& vl2 = GET_FIELD (gbq2, Val);

    if (! NStr::EqualNocase (vl1, vl2)) return false;

    return true;
}

// Normalizes known misspellings in a comma-separated exception text.
//
// If none of the known variants occurs anywhere in except_text the string
// is left untouched.  Otherwise it is split at commas, every piece is
// trimmed (eTrimSpaces), pieces that exactly equal a known variant are
// replaced by the canonical spelling (eChangeException), and the pieces
// are joined back with ", ".
void CNewCleanup_imp::Except_textBC (
    string& except_text
)

{
    struct SSpellingFix {
        const char* variant;
        const char* canonical;
    };
    static const SSpellingFix kSpellingFixes[] = {
        { "ribosome slippage",               "ribosomal slippage" },
        { "ribosome-slippage",               "ribosomal slippage" },
        { "ribosome_slippage",               "ribosomal slippage" },
        { "ribosomal-slippage",              "ribosomal slippage" },
        { "ribosomal_slippage",              "ribosomal slippage" },
        { "trans splicing",                  "trans-splicing" },
        { "trans_splicing",                  "trans-splicing" },
        { "alternate processing",            "alternative processing" },
        { "adjusted for low quality genome", "adjusted for low-quality genome" },
        { "non-consensus splice site",       "nonconsensus splice site" }
    };

    // cheap pre-check: bail out unless some variant occurs in the text
    bool any_variant_present = false;
    for (const SSpellingFix& fix : kSpellingFixes) {
        if (NStr::Find (except_text, fix.variant) != NPOS) {
            any_variant_present = true;
            break;
        }
    }
    if (! any_variant_present) {
        return ;
    }

    vector<string> exceptions;
    NStr::Split(except_text, ",", exceptions, NStr::fSplit_Tokenize);

    for (string& text : exceptions) {
        const size_t untrimmed_len = text.length();
        NStr::TruncateSpacesInPlace (text);
        if (text.length() != untrimmed_len) {
            ChangeMade (CCleanupChange::eTrimSpaces);
        }
        if (text.empty()) {
            continue;
        }
        // only whole-piece matches are corrected
        for (const SSpellingFix& fix : kSpellingFixes) {
            if (text == fix.variant) {
                text = fix.canonical;
                ChangeMade (CCleanupChange::eChangeException);
                break;
            }
        }
    }

    except_text = NStr::Join (exceptions, ", ");
}

static
bool s_SeqLocAnyNull( const CSeq_loc & loc )
{
    CSeq_loc_CI loc_ci( loc, CSeq_loc_CI::eEmpty_Allow);
    for( ; loc_ci; ++loc_ci ) {
        const CSeq_loc& loc_piece = loc_ci.GetEmbeddingSeq_loc();
        if( loc_piece.IsNull() ) {
            return true;
        }
    }

    return false;
}


// Sorts the GenBank qualifiers of sf: "legal" qualifiers first, then by
// name and value (see s_GbQualCompareLegalFirst).  "product" qualifiers
// are exempt from sorting by value: they are pulled out first, blank ones
// are dropped, and the rest are re-inserted as a group, in their original
// relative order, at the position where "product" belongs.
//
// An existing but empty qualifier list is reset.
//
// Returns true when sf was modified, false otherwise (including when sf
// has no qualifiers at all).
bool SortGBQuals(CSeq_feat& sf)
{
    if (!sf.IsSetQual()) {
        return false;
    }
    if (sf.GetQual().empty()) {
        sf.ResetQual();
        return true;
    }

    // snapshot used to decide at the end whether anything changed
    CRef<CSeq_feat> orig(new CSeq_feat());
    orig->Assign(sf);

    // first, extract product qualifier values, because order must be
    // preserved
    vector<string> products;
    auto& qualset = sf.SetQual();
    CSeq_feat::TQual::iterator it = qualset.begin();
    while (it != qualset.end()) {
        if ((*it)->IsSetQual() && NStr::EqualNocase((*it)->GetQual(), "product")) {
            if ((*it)->IsSetVal() && !NStr::IsBlank((*it)->GetVal())) {
                products.push_back((*it)->GetVal());
            }
            it = qualset.erase(it);
        } else {
            ++it;
        }
    }

    if (sf.GetQual().size() > 1) {
        SORT_GBQUAL_ON_SEQFEAT(sf, s_GbQualCompareLegalFirst);
    }

    // insert product qualifiers back in list: find the first qualifier
    // that has no name, sorts after "product", or is an illegal qualifier
    it = qualset.begin();
    while (it != qualset.end()) {
        if (!(*it)->IsSetQual() ||
            s_CompareNoCaseCStyle("product", (*it)->GetQual()) < 0 ||
            s_IsIllegalQual((*it)->GetQual())) {
            break;
        }
        ++it;
    }
    if (it == qualset.end()) {
        ITERATE(vector<string>, s, products) {
            qualset.emplace_back(new CGb_qual("product", *s));
        }
    } else {
        ITERATE(vector<string>, s, products) {
            CRef<CGb_qual> pq(new CGb_qual("product", *s));
            // insert() returns the position of the new element; step past
            // it so the next product lands AFTER this one.  Without the
            // increment the products would come out in reverse order.
            it = qualset.insert(it, pq);
            ++it;
        }
    }
    return !(orig->Equals(sf));
}


// Converts every qualifier whose name starts with "go_" into a gene
// ontology term on sf (via CReadUtil::AddGeneOntologyTerm) and removes the
// qualifier.  Each successful conversion is recorded as
// eMoveGeneOntologyTerm.  A qualifier whose conversion throws is left in
// place untouched.
void CNewCleanup_imp::x_ConvertGoQualifiers(CSeq_feat& sf)
{
    if (!sf.CanGetQual()) {
        return;
    }
    auto& quals = sf.SetQual();
    for (auto qualIt = quals.begin(); qualIt != quals.end(); /**/) {
        auto& qualRef = *qualIt;
        if (!qualRef->CanGetQual()  ||  !NStr::StartsWith(qualRef->GetQual(), "go_")) {
            ++qualIt;
            continue;
        }
        try {
            CReadUtil::AddGeneOntologyTerm(sf, qualRef->GetQual(), qualRef->GetVal());
            qualIt = quals.erase(qualIt);
            ChangeMade (CCleanupChange::eMoveGeneOntologyTerm);
        }
        catch (const CException&) {
            // Conversion failed: deliberately ignore the error and keep the
            // qualifier, but move on to the next one.  Without advancing
            // here the same qualifier would be retried forever.
            ++qualIt;
        }
    }
}

// Cleans the GenBank qualifier list of a feature:
//  1. runs GBQualBC on every qualifier,
//  2. sorts the list (SortGBQuals) and removes duplicates,
//  3. lets CCleanup::ParseCodeBreaks turn qualifiers into code breaks,
//  4. runs GBQualSeqFeatBC on every remaining qualifier, erasing those for
//     which it returns eAction_Erase,
//  5. removes the qualifier list altogether if it ended up empty.
// Does nothing when the feature has no qualifiers.
void CNewCleanup_imp::x_CleanSeqFeatQuals(CSeq_feat& sf)
{
    if (!sf.IsSetQual()) {
        return;
    }
    // clean before uniquing
    EDIT_EACH_GBQUAL_ON_SEQFEAT(gbq_it, sf) {
        CGb_qual& gbq = **gbq_it;
        GBQualBC(gbq);
    }

    // sort/unique gbquals, just alphabetically
    if (SortGBQuals(sf)) {
        ChangeMade(CCleanupChange::eCleanQualifiers);
    }

    if (!GBQUAL_ON_SEQFEAT_IS_UNIQUE(sf, s_GbQualEqual)) {
        UNIQUE_GBQUAL_ON_SEQFEAT(sf, s_GbQualEqual);
        ChangeMade(CCleanupChange::eRemoveQualifier);
    }

    // move quals to other parts of the feature as appropriate
    if (CCleanup::ParseCodeBreaks(sf, *m_Scope)) {
        ChangeMade(CCleanupChange::eChangeCodeBreak);
        ChangeMade(CCleanupChange::eRemoveQualifier);
    }
    // the qualifier list is re-checked here before iterating over it again
    if (!sf.IsSetQual()) {
        return;
    }
    EDIT_EACH_GBQUAL_ON_SEQFEAT(gbq_it, sf) {
        CGb_qual& gbq = **gbq_it;
        if (GBQualSeqFeatBC(gbq, sf) == eAction_Erase)
        {
            ERASE_GBQUAL_ON_SEQFEAT(gbq_it, sf);
            ChangeMade(CCleanupChange::eRemoveQualifier);
        }
    }

    REMOVE_IF_EMPTY_GBQUAL_ON_SEQFEAT(sf);
}

// Basic cleanup of a single feature: qualifiers, title, the
// except/pseudo/partial flags, exception text, dbxrefs, citations and
// (for feature subtypes that do not allow it) "both" strands in the
// location.
void CNewCleanup_imp::SeqfeatBC (
    CSeq_feat& sf
)

{
    // note - need to clean up GBQuals before dbxrefs, because they may be converted to populate other fields
    x_CleanSeqFeatQuals(sf);
    x_ConvertGoQualifiers(sf);

    CLEAN_STRING_MEMBER (sf, Title);

    // A flag explicitly set to false is reset.  Note that all three resets
    // below are reported under the same change code, eRemoveException.
    if( FIELD_EQUALS( sf, Except, false ) ) {
        RESET_FIELD( sf, Except );
        ChangeMade (CCleanupChange::eRemoveException);
    }

    if( FIELD_EQUALS( sf, Pseudo, false ) ) {
        RESET_FIELD( sf, Pseudo );
        ChangeMade (CCleanupChange::eRemoveException);
    }

    if( FIELD_EQUALS( sf, Partial, false ) ) {
        RESET_FIELD( sf, Partial );
        ChangeMade (CCleanupChange::eRemoveException);
    }

    CLEAN_STRING_MEMBER (sf, Except_text);
    if (FIELD_IS_SET (sf, Except_text)) {
        string &et = GET_MUTABLE (sf, Except_text);
        Except_textBC (et);
        // a comment that merely repeats the exception text is dropped
        if( FIELD_EQUALS(sf, Except, true) && FIELD_EQUALS(sf, Comment, et) ) {
            RESET_FIELD(sf, Comment);
            ChangeMade (CCleanupChange::eRemoveComment);
        }
    }

    // Dbtags produced by x_SplitDbtag are collected here and appended only
    // after the loop, so the dbxref list is not grown while it is iterated.
    vector< CRef< CDbtag > > new_dbtags;
    EDIT_EACH_DBXREF_ON_SEQFEAT (dbx_it, sf) {
        CDbtag& dbt = **dbx_it;
        DbtagBC(dbt);
        x_SplitDbtag(dbt, new_dbtags );
    }
    if( ! new_dbtags.empty() ) {
        copy( new_dbtags.begin(), new_dbtags.end(), back_inserter(sf.SetDbxref()) );
    }

    // sort dbxrefs
    if (!DBXREF_ON_SEQFEAT_IS_SORTED(sf, s_DbtagCompare)) {
        SORT_DBXREF_ON_SEQFEAT(sf, s_DbtagCompare);
        ChangeMade(CCleanupChange::eCleanDbxrefs);
    }


    CALL_IF_SET( PubSetBC, sf, Cit );

    // NOTE(review): sf.GetData() is called unconditionally here - this
    // presumably relies on every feature having its data set; confirm.
    if (!CSeqFeatData::AllowStrandBoth(sf.GetData().GetSubtype())) {
        x_BothStrandBC(sf.SetLocation());
    }
}

// Second cleanup pass over a feature: re-cleans the comment and the
// qualifiers, removes bad dbxrefs and empty xrefs, sorts/uniques the
// dbxrefs, and sets the partial flag when the location calls for it.
void CNewCleanup_imp::x_PostSeqFeat( CSeq_feat& sf )
{
    // need to clean this up in case it was changed by our children
    CLEAN_STRING_MEMBER (sf, Comment);
    CALL_IF_SET( CleanDoubleQuote, sf, Comment );
    // a comment matching just "." is removed entirely
    if ( STRING_FIELD_MATCH( sf, Comment, "." ) ) {
        RESET_FIELD (sf, Comment);
        ChangeMade (CCleanupChange::eChangeComment);
    }

    // sort/unique gbquals (yes, must do before *and* after )
    x_CleanSeqFeatQuals(sf);

    EDIT_EACH_DBXREF_ON_SEQFEAT (dbx_it, sf) {
        CDbtag& dbt = **dbx_it;
        if (s_DbtagIsBad (dbt)) {
            ERASE_DBXREF_ON_SEQFEAT (dbx_it, sf);
            ChangeMade (CCleanupChange::eCleanDbxrefs);
        }
    }

    // sort/unique db_xrefs
    if (! DBXREF_ON_SEQFEAT_IS_SORTED (sf, s_DbtagCompare)) {
        SORT_DBXREF_ON_SEQFEAT (sf, s_DbtagCompare);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }
    if (! DBXREF_ON_SEQFEAT_IS_UNIQUE (sf, s_DbtagEqual)) {
        UNIQUE_DBXREF_ON_SEQFEAT (sf, s_DbtagEqual);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }
    REMOVE_IF_EMPTY_DBXREF_ON_SEQFEAT( sf );

    // xrefs: drop entries that carry neither an id nor data
    EDIT_EACH_SEQFEATXREF_ON_SEQFEAT( xref_iter, sf ) {
        CSeqFeatXref &xref = **xref_iter;
        if( ! FIELD_IS_SET(xref, Id) && ! FIELD_IS_SET(xref, Data) ) {
            ERASE_SEQFEATXREF_ON_SEQFEAT(xref_iter, sf);
            ChangeMade (CCleanupChange::eCleanSeqFeatXrefs);
        }
    }
    REMOVE_IF_EMPTY_SEQFEATXREF_ON_SEQFEAT( sf );

    // clean up partial flag
    // Only the start/stop bits of the partial check are considered.
    const unsigned int partial_loc_mask = ( 
        sequence::eSeqlocPartial_Start      | 
        sequence::eSeqlocPartial_Stop       );
    const unsigned int partial_loc = 
        sequence::SeqLocPartialCheck( GET_FIELD( sf, Location ), m_Scope );
    if ( FIELD_EQUALS(sf, Partial, true) ) {
        // do nothing, will not change partial if already set
    } else if ( (partial_loc & partial_loc_mask) || ( s_SeqLocAnyNull( GET_FIELD( sf, Location ) ) && ! m_IsEmblOrDdbj) ) {
        // partial at either end, or (outside EMBL/DDBJ records) a location
        // containing a Null piece: mark the feature partial
        SET_FIELD( sf, Partial, true );
        ChangeMade (CCleanupChange::eChangePartial);
    }
}

// Case-sensitive "less than" for gene synonyms.
static bool
s_GeneSynCompareCS(
    const string &syn1,
    const string &syn2 )
{
    return syn1.compare(syn2) < 0;
}

// Case-sensitive equality for gene synonyms.
static bool
s_GeneSynEqual(
    const string &syn1,
    const string &syn2 )
{
    return syn1.compare(syn2) == 0;
}

// CILCFirst stands for "case-insensitive, lower-case first":
// synonyms are ordered by s_CompareNoCaseCStyle; when that reports a tie,
// the case-sensitive comparison is applied in reverse so that the
// lower-case spelling comes first.
static bool
s_GeneSynCompareCILCFirst(
    const string &syn1,
    const string &syn2 )
{
    const int order_ignoring_case = s_CompareNoCaseCStyle( syn1, syn2 );
    return ( order_ignoring_case != 0 )
        ? ( order_ignoring_case < 0 )
        : ( syn2 < syn1 );   // notice reversal, so that lowercase is first
}

// Unary predicate: true for strings of length zero.
class CStringIsEmpty
{
public:
    bool operator()( const string &str ) const
    {
        return str.size() == 0;
    }
};

// returns true if a split was done and added to gene_syns_to_add
// gene_syns_to_add is unaffected if syn was not split.
bool s_SplitGeneSyn( const string &syn, vector<string> &gene_syns_to_add)
{
    // preliminary quick-test
    if( syn.find_first_of(",;") == NPOS ) {
        return false;
    }

    // split by comma
    vector<string> pieces_split_by_comma;
    NStr::Split(syn, ",", pieces_split_by_comma, NStr::fSplit_Tokenize);

    // now split each of those pieces by "; "
    vector<string> pieces_split_by_semicolon;
    FOR_EACH_STRING_IN_VECTOR( piece_iter, pieces_split_by_comma ) {
        NStr::SplitByPattern(*piece_iter, "; ", pieces_split_by_semicolon);
    }

    if( pieces_split_by_semicolon.size() > 1 ) {
        // copy non-empty pieces, trimming as we go
        EDIT_EACH_STRING_IN_VECTOR( piece_iter, pieces_split_by_semicolon ) {
            CleanVisString( *piece_iter );
            if( ! piece_iter->empty() ) {
                gene_syns_to_add.push_back(*piece_iter);
            }
        }
        return true;
    } else {
        return false;
    }
}

// Basic cleanup of a gene reference:
//  * a locus containing '|' is split; the first token stays the locus and
//    the remaining tokens become synonyms,
//  * synonyms containing commas or "; " are split into separate synonyms,
//  * synonyms are sorted, de-duplicated and finally ordered
//    case-insensitively with lower case first,
//  * synonyms equal to the locus are removed,
//  * bad dbxrefs are removed and the dbxref list is sorted and uniqued.
void CNewCleanup_imp::GenerefBC (
    CGene_ref& gr
)

{
    if (gr.IsSetLocus()) {
        // split locus at '|', copy values after first to synonyms
        // VR-746
        vector<string> tokens;
        NStr::Split(gr.GetLocus(), "|", tokens);
        if (tokens.size() > 1) {
            for (size_t i = 1; i < tokens.size(); i++) {
                gr.SetSyn().push_back(tokens[i]);
            }
            gr.SetLocus(tokens[0]);
            ChangeMade(CCleanupChange::eChangeGeneRef);
        }
    }

    // split gene synonyms that have a comma or "; "
    // (the pieces are collected and appended after the loop; the synonym
    // that was split is erased from the list)
    vector<string> gene_syns_to_add;
    EDIT_EACH_SYNONYM_ON_GENEREF (syn_itr, gr) {
        string& syn = *syn_itr;
        if( s_SplitGeneSyn(syn, gene_syns_to_add) ) {
            ERASE_SYNONYM_ON_GENEREF (syn_itr, gr);
            ChangeMade (CCleanupChange::eChangeGeneRef);
        }
    }
    if( ! gene_syns_to_add.empty() ) {
        copy( gene_syns_to_add.begin(), gene_syns_to_add.end(), 
            back_inserter(gr.SetSyn()) );
        ChangeMade (CCleanupChange::eChangeGeneRef);
    }

    // Sort case-sensitively first so that exact duplicates become adjacent
    // for the unique step, then re-sort into the final order
    // (case-insensitive, lower case first).
    if( ! SYNONYM_ON_GENEREF_IS_SORTED(gr, s_GeneSynCompareCS) ) {
        SORT_SYNONYM_ON_GENEREF( gr, s_GeneSynCompareCS );
        ChangeMade (CCleanupChange::eChangeGeneRef);
    }
    if (! SYNONYM_ON_GENEREF_IS_UNIQUE (gr, s_GeneSynEqual)) {
        UNIQUE_SYNONYM_ON_GENEREF(gr, s_GeneSynEqual);
        ChangeMade (CCleanupChange::eChangeGeneRef);
    }
    if( ! SYNONYM_ON_GENEREF_IS_SORTED(gr, s_GeneSynCompareCILCFirst) ) {
        SORT_SYNONYM_ON_GENEREF( gr, s_GeneSynCompareCILCFirst );
        ChangeMade (CCleanupChange::eChangeGeneRef);
    }

    // remove synonyms equal to locus
    // NOTE(review): this early return also skips the dbxref cleanup further
    // down whenever no locus is set - confirm that this is intended.
    if (! FIELD_IS_SET (gr, Locus)) return;
    const string& locus = GET_FIELD (gr, Locus);

    EDIT_EACH_SYNONYM_ON_GENEREF (syn_itr, gr) {
        string& syn = *syn_itr;
        if (NStr::Equal (locus, syn)) {
            ERASE_SYNONYM_ON_GENEREF (syn_itr, gr);
            ChangeMade (CCleanupChange::eChangeGeneRef);
        }
    }

    // remove obsolete or otherwise stale dbxrefs
    EDIT_EACH_DBXREF_ON_GENEREF(it, gr) {
        CDbtag& dbt = **it;
        if (s_DbtagIsBad (dbt)) {
            ERASE_DBXREF_ON_GENEREF (it, gr);
            ChangeMade (CCleanupChange::eCleanDbxrefs);
        }
    }

    // sort/uniq dbxrefs on generef
    if( ! DBXREF_ON_GENEREF_IS_SORTED(gr, s_DbtagCompare) ) {
        SORT_DBXREF_ON_GENEREF(gr, s_DbtagCompare);
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    }
    if( ! DBXREF_ON_GENEREF_IS_UNIQUE(gr, s_DbtagEqual) ) {
        UNIQUE_DBXREF_ON_GENEREF(gr, s_DbtagEqual);
        ChangeMade(CCleanupChange::eRemoveGeneXref);
    }
}

// Reports whether a gene reference has none of the following fields set:
// locus, allele, desc, maploc, db, syn, locus_tag.
static bool s_IsEmptyGeneRef (const CGene_ref& gr)

{
    const bool has_any_field =
        FIELD_IS_SET (gr, Locus)  ||
        FIELD_IS_SET (gr, Allele) ||
        FIELD_IS_SET (gr, Desc)   ||
        FIELD_IS_SET (gr, Maploc) ||
        FIELD_IS_SET (gr, Db)     ||
        FIELD_IS_SET (gr, Syn)    ||
        FIELD_IS_SET (gr, Locus_tag);

    return ! has_any_field;
}

// True when the comment text matches the gene's locus_tag or one of its
// synonyms (as decided by STRING_FIELD_MATCH / STRING_SET_MATCH).
static bool s_CommentRedundantWithGeneRef (
    CGene_ref& gene_ref,
    const string& comm
)

{
    return (STRING_FIELD_MATCH (gene_ref, Locus_tag, comm))
        || (STRING_SET_MATCH   (gene_ref, Syn,       comm));
}

// Builds a CDbtag from text of the form "db:tag".
// The part before the ':' becomes the db name.  The remainder becomes a
// numeric tag id when it is all digits and converts to a positive int;
// otherwise it is stored as a string tag.
// Returns a null CRef when NStr::SplitInTwo cannot split the input on ':'.
static
CRef<CDbtag> s_DbtagParse( const string &dbtag_str )
{
    CRef<CDbtag> parsed( new CDbtag );

    string tag_text;
    const bool was_split =
        NStr::SplitInTwo(dbtag_str, ":", parsed->SetDb(), tag_text );
    if( ! was_split ) {
        return CRef<CDbtag>();
    }

    // Only attempt the conversion for all-digit text; a failed conversion
    // (fConvErr_NoThrow) or a zero value leaves numeric_id non-positive.
    int numeric_id = 0;
    if( s_IsAllDigits(tag_text) ) {
        numeric_id = NStr::StringToInt(tag_text, NStr::fConvErr_NoThrow);
    }

    if( numeric_id > 0 ) {
        parsed->SetTag().SetId( numeric_id );
    } else {
        parsed->SetTag().SetStr().swap( tag_text );
    }

    return parsed;
}

// Forward declaration: the helper below and this function call each other.
static
CConstRef<CUser_object> s_FindUserObjectTypeRecursive( const CUser_object &user_obj, const string &sought_type_label );

// Searches the data of one user field for a matching user object.
// - a single nested object is searched via s_FindUserObjectTypeRecursive
// - a list of nested fields is searched field by field (this function)
// - a list of nested objects is searched object by object
// The first hit is returned; a null ref means nothing was found (including
// the case where the field has no data or holds some other kind of data).
static 
CConstRef<CUser_object> s_FindUserObjectTypeRecursive_helper( const CUser_field &field, const string &sought_type_label )
{
    if( ! FIELD_IS_SET(field, Data) ) {
        return CConstRef<CUser_object>();
    }

    const CUser_field::C_Data &data = GET_FIELD(field, Data);
    switch( data.Which() ) {
        case CUser_field::C_Data::e_Object:
            return s_FindUserObjectTypeRecursive( data.GetObject(), sought_type_label );

        case CUser_field::C_Data::e_Fields:
            for( const auto &sub_field : data.GetFields() ) {
                CConstRef<CUser_object> found = s_FindUserObjectTypeRecursive_helper( *sub_field, sought_type_label );
                if( found ) {
                    return found;
                }
            }
            break;

        case CUser_field::C_Data::e_Objects:
            for( const auto &sub_object : data.GetObjects() ) {
                CConstRef<CUser_object> found = s_FindUserObjectTypeRecursive( *sub_object, sought_type_label );
                if( found ) {
                    return found;
                }
            }
            break;

        default:
            break;
    }

    return CConstRef<CUser_object>();
}

// Depth-first search for a user object whose type is the string
// sought_type_label.
//
// user_obj          - root of the search; it is itself tested first.
// sought_type_label - type string to look for (exact, case-sensitive ==).
//
// Returns a reference to the first matching object, or a null ref when
// neither user_obj nor anything nested in its fields matches.
static
CConstRef<CUser_object> s_FindUserObjectTypeRecursive( const CUser_object &user_obj, const string &sought_type_label )
{
    // is the one we're given a match?
    // (compare against the caller's label rather than a hard-coded literal,
    // so the parameter is actually honored)
    if( FIELD_IS_SET_AND_IS(user_obj, Type, Str) && user_obj.GetType().GetStr() == sought_type_label ) {
        return CConstRef<CUser_object>( &user_obj );
    }

    // otherwise, recurse downwards depth-first
    FOR_EACH_USERFIELD_ON_USEROBJECT(field_iter, user_obj) {
        CConstRef<CUser_object> result = s_FindUserObjectTypeRecursive_helper( **field_iter, sought_type_label );
        if( result ) {
            return result;
        }
    }

    return CConstRef<CUser_object>();
}


// Moves the dbtags of gene_ref.db onto seq_feat.dbxref.
// Each tag is copied (via Assign) into a freshly allocated CDbtag that is
// appended to the feature; afterwards the gene's db field is reset.
//
// Returns true whenever gene_ref had its Db field set (even if the list held
// no entries), false when there was nothing to move.
bool s_CopyDbToFeat(CGene_ref& gene_ref, CSeq_feat& seq_feat)
{
    if (!gene_ref.IsSetDb()) {
        return false;
    }

    // Bind by const reference: copying the CRef for every element only adds
    // reference-count traffic.
    for (const auto& src_tag : gene_ref.GetDb()) {
        CRef<CDbtag> tag_copy(new CDbtag());
        tag_copy->Assign(*src_tag);
        seq_feat.SetDbxref().push_back(tag_copy);
    }
    gene_ref.ResetDb();
    return true;
}


// Basic cleanup of a gene feature: reconciles the Gene-ref with the Seq-feat
// that carries it.  In order, it
//   - moves a true gene.pseudo flag up to the feature and clears the gene field,
//   - drops a feature comment that only repeats the gene's locus_tag/synonyms,
//   - moves gene.db, and the db of every gene xref, to feat.dbxref, erasing
//     gene xrefs that are left empty,
//   - when gene.formal_name is unset, builds it from an "OfficialNomenclature"
//     user object found in feat.ext.
// Every modification is reported through ChangeMade().
void CNewCleanup_imp::GeneFeatBC (
    CGene_ref& gene_ref,
    CSeq_feat& seq_feat
)

{
    // move gene.pseudo to feat.pseudo
    if (FIELD_IS_SET (gene_ref, Pseudo)) {
        // Only a true flag is propagated.  A false gene.pseudo is simply
        // dropped: it must not wipe a pseudo flag the feature already has.
        if( GET_FIELD(gene_ref, Pseudo) ) {
            SET_FIELD (seq_feat, Pseudo, true);
        }
        RESET_FIELD (gene_ref, Pseudo);
        ChangeMade (CCleanupChange::eChangeQualifiers);
    }

    // remove feat.comment if equal to various gene fields
    if (FIELD_IS_SET (seq_feat, Comment)) {
        if (s_CommentRedundantWithGeneRef (gene_ref, GET_FIELD (seq_feat, Comment))) {
            RESET_FIELD (seq_feat, Comment);
            ChangeMade (CCleanupChange::eChangeComment);
        }
    }
        
    // move gene.db to feat.dbxref
    if (s_CopyDbToFeat(gene_ref, seq_feat)) {
        ChangeMade (CCleanupChange::eChangeDbxrefs);
    }
        
    // move feat.xref.gene.db to feat.dbxref
    if (seq_feat.IsSetXref()) {
        auto xr_itr = seq_feat.SetXref().begin();
        while (xr_itr != seq_feat.SetXref().end()) {
            CSeqFeatXref& sfx = **xr_itr;
            if (sfx.IsSetData() && sfx.GetData().IsGene()) {
                // distinct name so the function parameter is not shadowed
                CGene_ref& xref_gene = sfx.SetData().SetGene();
                if (s_CopyDbToFeat(xref_gene, seq_feat)) {
                    ChangeMade(CCleanupChange::eChangeDbxrefs);
                }

                if (s_IsEmptyGeneRef(xref_gene)) {
                    // erase() hands back the next valid iterator, so skip
                    // the increment at the bottom of the loop
                    xr_itr = seq_feat.SetXref().erase(xr_itr);
                    ChangeMade(CCleanupChange::eChangeDbxrefs);
                    continue;
                }
            }
            ++xr_itr;
        }
    }

    REMOVE_IF_EMPTY_SEQFEATXREF_ON_SEQFEAT(seq_feat);

    // ModernizeGeneFields
    // (that is, create a formal_name from User-objects, if possible)
    if( ! FIELD_IS_SET(gene_ref, Formal_name) && FIELD_IS_SET(seq_feat, Ext)) {
        CConstRef<CUser_object> user_obj_ref = s_FindUserObjectTypeRecursive( GET_FIELD(seq_feat, Ext), "OfficialNomenclature" );

        if( user_obj_ref ) {
            const CUser_object &user_obj = *user_obj_ref;

            // These point into user_obj; they stay valid until feat.ext is
            // reset at the end of this block.
            const string *symbol = NULL;
            const string *name = NULL;
            const string *source = NULL;
            CGene_nomenclature::EStatus status = CGene_nomenclature::eStatus_unknown;

            // collect the string-valued fields we understand
            FOR_EACH_USERFIELD_ON_USEROBJECT(user_field_iter, user_obj) {
                const CUser_field &user_field = **user_field_iter;
                if( FIELD_IS_SET_AND_IS(user_field, Label, Str) && FIELD_IS_SET_AND_IS(user_field, Data, Str) ) {
                    const string &label_str = GET_FIELD(user_field.GetLabel(), Str);
                    const string &data_str = GET_FIELD(user_field.GetData(), Str);

                    if( NStr::EqualNocase(label_str, "Symbol") ) {
                        symbol = &data_str;
                    } else if( NStr::EqualNocase(label_str, "Name") ) {
                        name = &data_str;
                    } else if( NStr::EqualNocase(label_str, "DataSource") ) {
                        source = &data_str;
                    } else if( NStr::EqualNocase(label_str, "Status") ) {
                        if( NStr::EqualNocase(data_str, "Official") ) {
                            status = CGene_nomenclature::eStatus_official;
                        } else if( NStr::EqualNocase(data_str, "Interim") ) {
                            status = CGene_nomenclature::eStatus_interim;
                        }
                    } 
                }
            }

            // only create a formal_name if at least one piece was found
            if( (symbol != NULL) || (name != NULL) || (source != NULL) || 
                (status != CGene_nomenclature::eStatus_unknown) ) 
            {
                CGene_nomenclature &gene_nomenclature = GET_MUTABLE(gene_ref, Formal_name);
                if( symbol != NULL ) {
                    gene_nomenclature.SetSymbol(*symbol);
                }
                if( name != NULL ) {
                    gene_nomenclature.SetName(*name);
                }
                if( source != NULL ) {
                    // parse "source" string into a CDbtag
                    CRef<CDbtag> new_dbtag = s_DbtagParse( *source );
                    if( new_dbtag ) {
                        gene_nomenclature.SetSource(*new_dbtag);
                    }
                }
                gene_nomenclature.SetStatus(status);

                // feat.ext is removed only when it is itself the matched
                // object; a match nested deeper inside ext is left in place
                if( &GET_FIELD(seq_feat, Ext) == user_obj_ref ) {
                    RESET_FIELD(seq_feat, Ext);
                }

                ChangeMade(CCleanupChange::eCreateGeneNomenclature);
            }
        }
    }
}

void CNewCleanup_imp::ProtNameBC (  std::string & str )
{
    const string::size_type old_length = str.length();
    CleanVisStringJunk (str, true);
    TrimInternalSemicolons (str);

    // Remove tabs
    if (NStr::Find(str, "\t") != NPOS) {
        NStr::ReplaceInPlace(str, "\t", " ");
        ChangeMade(CCleanupChange::eChangeProtNames);
    }

    if (str.length() != old_length) {
        ChangeMade (CCleanupChange::eChangeProtNames);
    }
}

void CNewCleanup_imp::ProtActivityBC (  std::string & str )
{
    const string::size_type old_length = str.length();
    CleanVisStringJunk (str, true);
    TrimInternalSemicolons (str);
    if (str.length() != old_length) {
        ChangeMade (CCleanupChange::eTrimSpaces);
    }
}

// Basic cleanup of a Prot-ref, independent of the feature that carries it.
// Steps, in order:
//   - drop a "processed" value of not_set,
//   - clean desc and strip one pair of enclosing single quotes from it,
//   - clean every name (ProtNameBC plus space compression), then the ec list,
//   - de-duplicate the activity list (PNocase) and remove it if empty,
//   - for EMBL/DDBJ records only, rewrite known rubisco name variants to the
//     canonical large/small subunit names,
//   - de-duplicate the name list (PNocase).
void CNewCleanup_imp::ProtrefBC (
    CProt_ref& prot_ref
)

{
    // "not set" should just be removed
    if( FIELD_EQUALS(prot_ref, Processed, NCBI_PROTREF(not_set) ) ) {
        RESET_FIELD(prot_ref, Processed);
        ChangeMade(CCleanupChange::eRemoveQualifier);
    }

    CLEAN_STRING_MEMBER (prot_ref, Desc);
    // strip the first and last character when desc both starts and ends
    // with a single quote
    if (prot_ref.IsSetDesc() &&
        NStr::StartsWith(prot_ref.GetDesc(), "'") &&
        NStr::EndsWith(prot_ref.GetDesc(), "'")) {
        string desc = prot_ref.GetDesc();
        // NOTE(review): for a desc that is just "'" the length argument
        // wraps around and the result is the empty string
        desc = desc.substr(1, desc.length() - 2);
        prot_ref.SetDesc(desc);
        ChangeMade (CCleanupChange::eChangeQualifiers);
    }

    // per-name cleanup; both helpers edit the string in place
    if (prot_ref.IsSetName()) {
        for (auto& it : prot_ref.SetName()) {
            ProtNameBC(it);
            x_CompressStringSpacesMarkChanged(it);
        }
    }


    REMOVE_IF_EMPTY_NAME_ON_PROTREF(prot_ref);

    CLEAN_STRING_LIST (prot_ref, Ec);

    UNIQUE_WITHOUT_SORT_ACTIVITY_ON_PROTREF( prot_ref, PNocase );

    REMOVE_IF_EMPTY_ACTIVITY_ON_PROTREF(prot_ref);

    // rubisco cleanup
    if( m_IsEmblOrDdbj ) {
        EDIT_EACH_NAME_ON_PROTREF (it, prot_ref) {
            // short forms: replace the name and, if desc is the same
            // abbreviation, drop desc as well
            if (NStr::EqualNocase (*it, "RbcL") || NStr::EqualNocase(*it, "rubisco large subunit")) {
                *it = "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit";
                ChangeMade (CCleanupChange::eChangeQualifiers);
                if (prot_ref.IsSetDesc() && NStr::EqualNocase(prot_ref.GetDesc(), "RbcL")) {
                    prot_ref.ResetDesc();
                }
                continue;
            } else if (NStr::EqualNocase (*it, "RbcS") || NStr::EqualNocase(*it, "rubisco small subunit")) {
                *it = "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit";
                ChangeMade (CCleanupChange::eChangeQualifiers);
                if (prot_ref.IsSetDesc() && NStr::EqualNocase(prot_ref.GetDesc(), "RbcS")) {
                    prot_ref.ResetDesc();
                }
                continue;
            } 

            // Long forms: the first three Find() tests are a cheap,
            // case-sensitive pre-filter; the name is rewritten only when it
            // is not already canonical and equals (ignoring case) one of the
            // listed large-subunit spellings.
            // This is pretty inefficient, so when there's time we should replace it with a map or something
            if (NStr::Find (*it, "ribulose") != string::npos
                && NStr::Find (*it, "bisphosphate") != string::npos
                && NStr::Find (*it, "methyltransferase") == string::npos
                && !NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit")
                && !NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit")
                && (NStr::EqualNocase (*it, "ribulose 1,5-bisphosphate carboxylase/oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose 1,5-bisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose bisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose-bisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase, large subunit")
                || NStr::EqualNocase (*it, "large subunit of ribulose-1,5-bisphosphate carboxylase/oxygenase")
                || NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose bisphosphate carboxylase large chain")
                || NStr::EqualNocase (*it, "ribulose 1,5-bisphosphate carboxylase-oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose bisphosphate carboxylase oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose 1,5 bisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase/oxygenase, large subunit")
                || NStr::EqualNocase (*it, "large subunit of ribulose-1,5-bisphosphate carboxylase/oxgenase")
                || NStr::EqualNocase (*it, "ribulose bisphosphate carboxylase/oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose-1,5-bisphosphate carboxylase oxygenase, large subunit")
                || NStr::EqualNocase (*it, "ribulose 5-bisphosphate carboxylase, large subunit")
                || NStr::EqualNocase (*it, "ribulosebisphosphate carboxylase large subunit")
                || NStr::EqualNocase (*it, "ribulose bisphosphate large subunit")
                || NStr::EqualNocase (*it, "ribulose 1,5 bisphosphate carboxylase/oxygenase large subunit")
                || NStr::EqualNocase (*it, "ribulose 1,5-bisphosphate carboxylase/oxygenase large chain")
                || NStr::EqualNocase (*it, "large subunit ribulose-1,5-bisphosphate carboxylase/oxygenase")
                || NStr::EqualNocase (*it, "ribulose-bisphosphate carboxylase, large subunit")
                || NStr::EqualNocase (*it, "ribulose-1, 5-bisphosphate carboxylase/oxygenase large-subunit")) ) {
                    *it = "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit";
                    ChangeMade (CCleanupChange::eChangeQualifiers);
            }
        }
    }

    // the rewrites above can produce duplicates, so de-duplicate last
    UNIQUE_WITHOUT_SORT(NAME_ON_PROTREF, prot_ref, PNocase, CCleanupChange::eChangeProtNames);
}

// Protein names that carry no real information.  s_IsInformativeName()
// rejects any name found in the set built from this list (lookup uses the
// PNocase comparator).
// NOTE(review): CStaticArraySet presumably needs the entries to be in
// PNocase order already -- keep the list sorted when adding names; confirm.
static const char* const uninf_names [] = {
    "peptide",
    "putative",
    "signal",
    "signal peptide",
    "signal-peptide",
    "signal_peptide",
    "transit",
    "transit peptide",
    "transit-peptide",
    "transit_peptide",
    "unknown",
    "unnamed"
};

typedef CStaticArraySet<string, PNocase> TUninformative;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TUninformative, sc_UninfNames, uninf_names);

// A name is informative when it is non-empty and is not one of the entries
// in sc_UninfNames.
static bool s_IsInformativeName (
    const string& name
)

{
    if (name.empty()) {
        return false;
    }

    const bool is_uninformative =
        ( sc_UninfNames.find(name) != sc_UninfNames.end() );
    return ! is_uninformative;
}

// True when the comment text matches one of the protein's names, its desc,
// or one of its EC numbers (checked in that order, as decided by
// STRING_SET_MATCH / STRING_FIELD_MATCH).
static bool s_CommentRedundantWithProtRef (
    CProt_ref& pr,
    const string& comm
)

{
    return (STRING_SET_MATCH   (pr, Name, comm))
        || (STRING_FIELD_MATCH (pr, Desc, comm))
        || (STRING_SET_MATCH   (pr, Ec,   comm));
}

// Basic cleanup of a protein feature: reconciles the Prot-ref with the
// Seq-feat that carries it.  In order, it
//   - for a nameless protein that is not a signal/transit peptide, moves a
//     comment that is exactly "putative" into the protein name,
//   - for signal/transit peptides, records "putative" as the comment when a
//     name mentions it (and no comment exists) and erases uninformative names,
//   - rewrites the rubisco short names to the canonical subunit names,
//   - gives nameless preprotein/mature proteins the default name "unnamed",
//   - drops a feature comment that only repeats name/desc/ec,
//   - moves prot.db to feat.dbxref.
// Every modification is reported through ChangeMade().
void CNewCleanup_imp::ProtFeatfBC (
    CProt_ref& pr,
    CSeq_feat& sf
)

{
    // an unset "processed" field is treated as not_set
    const TPROTREF_PROCESSED processed = ( FIELD_IS_SET (pr, Processed) ?
        GET_FIELD (pr, Processed) :
        NCBI_PROTREF(not_set) );

    // move putative from comment to protein name for mat peptide
    if (FIELD_IS_SET (sf, Comment) && 
        RAW_FIELD_IS_EMPTY_OR_UNSET(pr, Name) &&
        processed != NCBI_PROTREF(signal_peptide) &&
        processed != NCBI_PROTREF(transit_peptide)) {
            // Only the bare word "putative" is promoted to a name; any other
            // comment is real annotation and must stay a comment.
            if (NStr::EqualNocase ("putative", GET_FIELD (sf, Comment))) {
                ADD_NAME_TO_PROTREF ( pr, GET_FIELD (sf, Comment) );
                ChangeMade(CCleanupChange::eChangeProtNames);
                RESET_FIELD (sf, Comment);
                ChangeMade(CCleanupChange::eRemoveComment);
            }
    }

    // move putative to comment, remove uninformative name of signal peptide
    if (FIELD_IS_SET (pr, Name)) {
        if (processed == NCBI_PROTREF(signal_peptide) ||
            processed == NCBI_PROTREF(transit_peptide)) {
                EDIT_EACH_NAME_ON_PROTREF (nm_itr, pr) {
                    string& str = *nm_itr;
                    if (NStr::Find (str, "putative") != NPOS ||
                        NStr::Find (str, "put. ") != NPOS) {
                            // never overwrite an existing comment
                            if (! FIELD_IS_SET (sf, Comment)) {
                                SET_FIELD (sf, Comment, "putative");
                                ChangeMade (CCleanupChange::eChangeComment);
                            }
                    }
                    if (! s_IsInformativeName (str)) {
                        ERASE_NAME_ON_PROTREF (nm_itr, pr);
                        ChangeMade (CCleanupChange::eChangeProtNames);
                    }
                }
        }

        EDIT_EACH_NAME_ON_PROTREF (nm_itr, pr) {
            string& str = *nm_itr;
            // rubisco
            if (NStr::EqualNocase (str, "RbcL") || NStr::EqualNocase(str, "rubisco large subunit")) {
                str = "ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit";
                ChangeMade (CCleanupChange::eChangeQualifiers);
            } else if (NStr::EqualNocase (str, "RbcS") || NStr::EqualNocase(str, "rubisco small subunit")) {
                str = "ribulose-1,5-bisphosphate carboxylase/oxygenase small subunit";
                ChangeMade (CCleanupChange::eChangeQualifiers);
            }
        }
    }

    // add unnamed as default protein name
    if ( RAW_FIELD_IS_EMPTY_OR_UNSET(pr, Name) ) {
        if (processed == NCBI_PROTREF(preprotein)  ||  
            processed == NCBI_PROTREF(mature)) {
                ADD_NAME_TO_PROTREF (pr, "unnamed");
                ChangeMade (CCleanupChange::eChangeQualifiers);
        }
    }

    // remove feat.comment if equal to various protein fields
    if (FIELD_IS_SET (sf, Comment)) {
        if (s_CommentRedundantWithProtRef (pr, GET_FIELD (sf, Comment))) {
            RESET_FIELD (sf, Comment);
            ChangeMade (CCleanupChange::eChangeComment);
        }
    }
        
    // move prot.db to feat.dbxref
    // (the CRefs themselves are appended, then the prot list is reset)
    if (pr.IsSetDb()) {
        auto& sfxref = sf.SetDbxref();
        sfxref.insert(sfxref.end(), pr.SetDb().begin(), pr.SetDb().end());
        pr.ResetDb();
        ChangeMade(CCleanupChange::eChangeDbxrefs);
    }

    REMOVE_IF_EMPTY_NAME_ON_PROTREF(pr);
}

// Final pass over a Prot-ref: erases dbxrefs that s_DbtagIsBad rejects,
// sorts and de-duplicates the remaining ones, and resets a blank desc.
void CNewCleanup_imp::PostProtFeatfBC (
    CProt_ref& prot_ref
)
{
    // erase rejected Dbtags
    EDIT_EACH_DBXREF_ON_PROTREF (xref_itr, prot_ref) {
        if (s_DbtagIsBad (**xref_itr)) {
            ERASE_DBXREF_ON_PROTREF (xref_itr, prot_ref);
            ChangeMade (CCleanupChange::eCleanDbxrefs);
        }
    }

    // sort first ...
    const bool xrefs_sorted =
        DBXREF_ON_PROTREF_IS_SORTED (prot_ref, s_DbtagCompare);
    if (! xrefs_sorted) {
        SORT_DBXREF_ON_PROTREF (prot_ref, s_DbtagCompare);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }

    // ... then check uniqueness on the sorted list
    const bool xrefs_unique =
        DBXREF_ON_PROTREF_IS_UNIQUE (prot_ref, s_DbtagEqual);
    if (! xrefs_unique) {
        UNIQUE_DBXREF_ON_PROTREF (prot_ref, s_DbtagEqual);
        ChangeMade (CCleanupChange::eCleanDbxrefs);
    }

    // a desc holding only blanks is no desc at all
    const bool desc_is_blank =
        prot_ref.IsSetDesc() && NStr::IsBlank(prot_ref.GetDesc());
    if( desc_is_blank ) {
        prot_ref.ResetDesc();
        ChangeMade (CCleanupChange::eChangeProtNames);
    }
}


// Known spellings of internal transcribed spacer names (first column) and
// the canonical name each one is translated to (second column).  Consulted
// by TranslateITSName(); keys are compared with PNocase.
// NOTE(review): CStaticArrayMap presumably needs the keys to be in PNocase
// order already -- keep the table sorted when adding rows; confirm.
typedef SStaticPair<const char*, const char*>  TInTrSpElem;
static const TInTrSpElem sc_its_map[] = {
    { "internal transcribed spacer 1 (ITS1)", "internal transcribed spacer 1" },
    { "internal transcribed spacer 2 (ITS2)", "internal transcribed spacer 2" },
    { "internal transcribed spacer 3 (ITS3)", "internal transcribed spacer 3" },
    { "its 1", "internal transcribed spacer 1" },
    { "its 2", "internal transcribed spacer 2" },
    { "its 3", "internal transcribed spacer 3" },
    { "its1", "internal transcribed spacer 1" },
    { "its2", "internal transcribed spacer 2" },
    { "its3", "internal transcribed spacer 3" },
    { "Ribosomal DNA internal transcribed spacer 1", "internal transcribed spacer 1" },
    { "Ribosomal DNA internal transcribed spacer 2", "internal transcribed spacer 2" },
    { "Ribosomal DNA internal transcribed spacer 3", "internal transcribed spacer 3" }
};
typedef CStaticArrayMap<string, string, PNocase> TInTrSpMap;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TInTrSpMap, sc_ITSMap, sc_its_map);


// True when name is, ignoring case, exactly one of the three canonical
// "internal transcribed spacer N" names (N = 1, 2 or 3).
bool CNewCleanup_imp::IsInternalTranscribedSpacer(const string& name)
{
    return NStr::EqualNocase(name, "internal transcribed spacer 1")
        || NStr::EqualNocase(name, "internal transcribed spacer 2")
        || NStr::EqualNocase(name, "internal transcribed spacer 3");
}


// Looks in_out_name up in sc_ITSMap.  On a hit the string is overwritten
// with the mapped canonical name and true is returned; otherwise the string
// is left untouched and false is returned.
bool CNewCleanup_imp::TranslateITSName( string &in_out_name )
{
    const TInTrSpMap::const_iterator hit = sc_ITSMap.find(in_out_name);
    if( hit == sc_ITSMap.end() ) {
        return false;
    }

    in_out_name = hit->second;
    return true;
}


// Runs TranslateITSName on the string and reports eChangeITS when it
// returns true.
void CNewCleanup_imp::x_TranslateITSNameAndFlag( string &in_out_name )
{
    const bool was_translated = TranslateITSName( in_out_name );
    if (was_translated) {
        ChangeMade(CCleanupChange::eChangeITS);
    }
}

// Names that s_IsNcrnaName() accepts as ncRNA class names (lookup uses the
// PNocase comparator).
// NOTE(review): CStaticArraySet presumably needs the entries to be in
// PNocase order already -- keep the list sorted when adding names; confirm.
static const char* const ncrna_names [] = {
    "antisense_RNA",
    "autocatalytically_spliced_intron",
    "guide_RNA",
    "hammerhead_ribozyme",
    "lncRNA",
    "miRNA",
    "other",
    "piRNA",
    "rasiRNA",
    "ribozyme",
    "RNase_MRP_RNA",
    "RNase_P_RNA",
    "scRNA",
    "siRNA",
    "snoRNA",
    "snRNA",
    "SRP_RNA",
    "telomerase_RNA",
    "vault_RNA",
    "Y_RNA"
};

typedef CStaticArraySet<string, PNocase> TNcrna;
DEFINE_STATIC_ARRAY_MAP_WITH_COPY(TNcrna, sc_NcrnafNames, ncrna_names);

// True when name is one of the entries of sc_NcrnafNames.
static bool s_IsNcrnaName (
    const string& name
)

{
    const bool is_listed =
        ( sc_NcrnafNames.find(name) != sc_NcrnafNames.end() );
    return is_listed;
}

static bool s_StartsWithNcrnaName( 
    const string& name,
    string &out_ncrna_name)
{
    string tmp_name = name;
    size_t pos = NStr::Find(name, " ");
    if (pos != NPOS) {
        tmp_name = name.substr(0, pos);
    }
    if (!NStr::EqualNocase(tmp_name, "other") && CRNA_gen::IsLegalClass(tmp_name)) {
        out_ncrna_name = tmp_name;
        CRNA_gen::FixncRNAClassValue(out_ncrna_name);
        return true;
    } else {
        return false;
    }
}

// special exception for genome pipeline rRNA names
static
bool s_NotExceptedRibosomalName( const string &name )
{
    // we are "not excepted" if there is a non-space/non-digit somewhere after " ribosomal"
    CCachedRegexp regex = regexpCache.Get(" ribosomal.*[^ 0-9]");
    return regex->IsMatch(name);
}


// Normalizes an rRNA product name in place.
// Steps, in order:
//   1. for names longer than 5 characters that s_NotExceptedRibosomalName
//      accepts, replace the first match of suffix_regex with
//      " ribosomal RNA", re-attaching the text that followed the match,
//   2. for names longer than 5 characters, upper-case an 's' that comes
//      right after the leading digits/dots and right before a space,
//   3. repeatedly strip spaces and collapse duplicated "ribosomal"/"RNA"
//      words until no replacement applies,
//   4. trim and remove one trailing period.
// eChangeRNAref is reported if the final string differs from the original.
void 
CNewCleanup_imp::x_RRNANameBC( string &name )
{
    // kept so the final comparison can tell whether anything changed
    const string original_name = name;

    if ( name.length() > 5 && s_NotExceptedRibosomalName (name)) {
        // suffix is *after* first match of suffix_regex
        CCachedRegexp suffix_regex = regexpCache.Get( 
            " (ribosomal|rRNA) ( ?RNA)?( ?DNA)?( ?ribosomal)?" );
        if( suffix_regex->IsMatch(name) ) {

            // extract suffix
            // NOTE(review): GetResults(0) is used here as the start ([0])
            // and end ([1]) offsets of the whole match -- confirm against
            // the regexp API
            const SIZE_TYPE suff_pos = ( suffix_regex->GetResults(0)[1] );
            string suff = name.substr(suff_pos);
            NStr::TruncateSpacesInPlace(suff);

            // cut ribosomal stuff off of name
            const SIZE_TYPE ribosomal_pos = suffix_regex->GetResults(0)[0];
            name.resize( ribosomal_pos );

            name += " ribosomal RNA"; 
            if ( ! suff.empty() ) {
                // no separating space before punctuation
                if (suff[0] != ',' && suff[0] != ';') {
                    name += " ";
                }
                name += suff;
            }
        }
    }
    if ( name.length() > 5) {
        // pos is the position of the first non-digit, non-dot character
        SIZE_TYPE pos = name.find_first_not_of(".0123456789");
        if( NPOS != pos ) {
            // pos+1 is at most name.size(), which std::string allows reading
            if( name[pos] == 's' && name[pos+1] == ' ' ) {
                name[pos] = 'S';
            }
        }
    }
    x_StripSpacesMarkChanged (name);

    // remove duplicate words and similar corrections
    // ( Behold the power of regular expressions; This while loop was about 80 lines in C. )
    // The || chain short-circuits, so the replacements are tried in order
    // and the loop body runs again as soon as one of them reports true.
    // NOTE(review): assumes s_RegexpReplace returns true when it changed the
    // string -- confirm.
    do {
        x_StripSpacesMarkChanged(name);
    } while( s_RegexpReplace( name, "ribosomal +ribosomal", "ribosomal ") || 
           s_RegexpReplace( name, "RNA +RNA", "RNA ") || 
           s_RegexpReplace( name, "ribosomal +RNA +ribosomal", "ribosomal RNA ") ||
           s_RegexpReplace( name, "ribosomal +rRNA", "ribosomal RNA ") ||
           s_RegexpReplace( name, "RNA +rRNA", "RNA ") );

    // drop one trailing period, then any spaces it exposed
    NStr::TruncateSpacesInPlace(name);
    if (NStr::EndsWith(name, ".")) {
        name = name.substr(0, name.length() - 1);
        NStr::TruncateSpacesInPlace(name);
    }

    if( original_name != name ) {
        ChangeMade(CCleanupChange::eChangeRNAref);
    }
}

// Basic cleanup of an RNA-ref whose ext is an RNA-gen.
// Steps, in order:
//   - reset a blank class, otherwise pass it through FixncRNAClassValue,
//   - reset a blank product,
//   - clean the quals, erasing any qual lacking a name or value, and reset
//     the qual set if it ends up empty,
//   - for miscRNA with a product but no class: if the product begins with a
//     legal ncRNA class followed by more text, split it into class + product
//     and retype the RNA as ncRNA,
//   - for mRNA/rRNA carrying only a product: replace the RNA-gen by a plain
//     ext.name holding that product,
//   - otherwise, if the RNA-gen is left with nothing set, reset ext.
// Every modification is reported through ChangeMade().
// Note that `ext` and `gen` are obtained with GET_MUTABLE, so the caller is
// expected to invoke this only when ext already is an RNA-gen.
void CNewCleanup_imp::RnarefGenBC(CRNA_ref& rr)
{
    CRNA_ref::C_Ext& ext = GET_MUTABLE(rr, Ext);
    CRNA_gen& gen = GET_MUTABLE(ext, Gen);

    if (FIELD_IS_SET(gen, Class)) {
        const string& str = GET_FIELD(gen, Class);
        if (NStr::IsBlank(str)) {
            RESET_FIELD(gen, Class);
            ChangeMade(CCleanupChange::eChangeRNAref);
        }
        else {
            string& class_val = GET_MUTABLE(gen, Class);
            if (CRNA_gen::FixncRNAClassValue(class_val)) {
                ChangeMade(CCleanupChange::eChangeRNAref);
            }
        }
    }
    if (FIELD_IS_SET(gen, Product)) {
        const string& str = GET_FIELD(gen, Product);
        if (NStr::IsBlank(str)) {
            RESET_FIELD(gen, Product);
            ChangeMade(CCleanupChange::eChangeRNAref);
        }
    }
    if (FIELD_IS_SET(gen, Quals)) {
        CRNA_qual_set& qset = GET_MUTABLE(gen, Quals);
        EDIT_EACH_QUAL_ON_RNAQSET(qitr, qset) {
            CRNA_qual& qual = **qitr;
            CLEAN_STRING_MEMBER(qual, Qual);
            CLEAN_STRING_MEMBER(qual, Val);
            // a qual needs both its name and its value to be kept
            if (!FIELD_IS_SET(qual, Qual) || !FIELD_IS_SET(qual, Val)) {
                ERASE_QUAL_ON_RNAQSET(qitr, qset);
                ChangeMade(CCleanupChange::eChangeRNAref);
            }
        }

        if (QUAL_ON_RNAQSET_IS_EMPTY(qset)) {
            RESET_FIELD(gen, Quals);
            ChangeMade(CCleanupChange::eChangeRNAref);
        }
    }

    if (FIELD_EQUALS(rr, Type, NCBI_RNAREF(miscRNA)) &&
        FIELD_IS_SET(gen, Product) &&
        !FIELD_IS_SET(gen, Class)) {
        string & product = GET_MUTABLE(gen, Product);
        string ncrna_name = kEmptyStr;
        if (s_StartsWithNcrnaName(product, ncrna_name)) {
            // only split when real product text follows "<class> "
            if (product.length() > (ncrna_name.length() + 1) &&
                product[ncrna_name.length()] == ' ') {
                SET_FIELD(gen, Class, ncrna_name);
                SET_FIELD(gen, Product, product.substr(ncrna_name.length() + 1));
                TRUNCATE_SPACES(gen, Class);
                TRUNCATE_SPACES(gen, Product);
                SET_FIELD(rr, Type, NCBI_RNAREF(ncRNA));
                ChangeMade(CCleanupChange::eChangeRNAref);
            }
        }
    }

    if ((FIELD_EQUALS(rr, Type, NCBI_RNAREF(mRNA)) ||
        FIELD_EQUALS(rr, Type, NCBI_RNAREF(rRNA))) &&
        STRING_FIELD_NOT_EMPTY(gen, Product) &&
        RAW_FIELD_IS_EMPTY_OR_UNSET(gen, Class) &&
        !FIELD_IS_SET(gen, Quals)) {
        // convert RNA-Gen to name.
        // Careful: this invalidates the "gen" variable, so the product is
        // copied out first and nothing below may touch "gen".
        const string product = GET_FIELD(gen, Product);
        SET_FIELD(ext, Name, product);
        // the ext changed representation, so it has to be reported like
        // every other modification made here
        ChangeMade(CCleanupChange::eChangeRNAref);
        return;
    }

    if (!FIELD_IS_SET(gen, Class) &&
        !FIELD_IS_SET(gen, Product) &&
        !FIELD_IS_SET(gen, Quals)) {
        RESET_FIELD(rr, Ext);
        ChangeMade(CCleanupChange::eChangeRNAref);
    }
}

// Basic cleanup of an RNA-ref.
// First pass, by kind of ext:
//   - name: a blank name resets ext; for rRNA a trailing " rRNA"/"_rRNA" is
//     expanded to " ribosomal RNA" and the name goes through x_RRNANameBC;
//     for other/miscRNA the name is ITS-translated and moved into an
//     RNA-gen product,
//   - tRNA: an aa choice that is not set is removed, codons are sorted and
//     de-duplicated, and an empty codon list is removed,
//   - gen: delegated to RnarefGenBC.
// Second pass, by RNA type: the legacy type "other" is converted to
// miscRNA, ncRNA or tmRNA depending on what ext.name holds.
void CNewCleanup_imp::RnarefBC (
    CRNA_ref& rr
)

{
    if (FIELD_IS_SET (rr, Ext)) {
        CRNA_ref::C_Ext& ext = GET_MUTABLE (rr, Ext);
        const TRNAREF_EXT chs = ext.Which();
        switch (chs) {
            case NCBI_RNAEXT(Name):
                {
                    string& name = GET_MUTABLE (ext, Name);
                    if (NStr::IsBlank (name)) {
                        RESET_FIELD (rr, Ext);
                        ChangeMade(CCleanupChange::eChangeRNAref);
                        break;
                    }

                    static const string rRNA = " rRNA";
                    static const string rRNA2 = "_rRNA";
                    static const string kRibosomal_Rna = " ribosomal RNA";
                    static const string kRibosomal_r_Rna = " ribosomal rRNA";

                    if (rr.IsSetType()) {
                        switch (rr.GetType()) {
                            case CRNA_ref::eType_rRNA:
                            {{
                                size_t len = name.length();
                                if (len >= rRNA.length() ) {
                                    if( NStr::EndsWith(name, rRNA, NStr::eNocase) || NStr::EndsWith(name, rRNA2, NStr::eNocase) ) {
                                        // " ribosomal rRNA" is replaced as a
                                        // whole so "ribosomal" is not doubled;
                                        // otherwise only the 5-character
                                        // " rRNA"/"_rRNA" tail is replaced
                                        if( NStr::EndsWith(name, kRibosomal_r_Rna, NStr::eNocase) ) {
                                            name.replace(len - kRibosomal_r_Rna.length(), name.size(), kRibosomal_Rna);
                                        } else {
                                            name.replace(len - rRNA.length(), name.size(), kRibosomal_Rna);
                                        }
                                        ChangeMade(CCleanupChange::eChangeQualifiers);
                                    }
                                }

                                x_RRNANameBC( name );

                                break;
                            }}
                            case CRNA_ref::eType_other:
                            case CRNA_ref::eType_miscRNA:
                                {{
                                    x_TranslateITSNameAndFlag(name); 

                                    // convert to RNA-gen
                                    string name_copy; // copy because name is about to be destroyed
                                    name_copy.swap( name );
                                    ext.SetGen().SetProduct( name_copy );
                                    ChangeMade(CCleanupChange::eChangeRNAref);
                                }}
                                break;
                            default:
                                break;
                        }
                    }
                }
                break;
            case NCBI_RNAEXT(TRNA):
                {
                    CTrna_ext& tRNA = GET_MUTABLE (ext, TRNA);
                    // an aa that is set but holds no choice is removed
                    if (FIELD_IS_SET (tRNA, Aa)) {
                        const CTrna_ext::C_Aa& aa = GET_FIELD (tRNA, Aa);
                        if (aa.Which() == CTrna_ext::C_Aa::e_not_set) {
                            RESET_FIELD (tRNA, Aa);
                            ChangeMade(CCleanupChange::eChangeRNAref);
                        }
                    }

                    // sort before the uniqueness pass
                    if (! CODON_ON_TRNAEXT_IS_SORTED(tRNA, s_CodonCompare)) {
                        SORT_CODON_ON_TRNAEXT(tRNA, s_CodonCompare);
                        ChangeMade(CCleanupChange::eChange_tRna);
                    }

                    if( ! CODON_ON_TRNAEXT_IS_UNIQUE(tRNA, s_CodonEqual) ) {
                        UNIQUE_CODON_ON_TRNAEXT(tRNA, s_CodonEqual);
                        ChangeMade(CCleanupChange::eChange_tRna);
                    }

                    REMOVE_IF_EMPTY_CODON_ON_TRNAEXT(tRNA);

                }
                break;
            case NCBI_RNAEXT(Gen):
                {
                    RnarefGenBC(rr);
                }
                break;
            default:
                break;
        }
    }

    // Second pass: the first pass may have changed ext, so it is re-read
    // here rather than reusing the reference from above.
    if (FIELD_IS_SET (rr, Type)) {
        TRNAREF_TYPE typ = GET_FIELD (rr, Type);
        switch (typ) {
            case NCBI_RNAREF(mRNA):
                {
                }
                break;
            case NCBI_RNAREF(tRNA):
                {
                }
                break;
            case NCBI_RNAREF(rRNA):
                {
                }
                break;
            case NCBI_RNAREF(other):
                {
                    if (FIELD_IS_SET (rr, Ext)) {
                        CRNA_ref::C_Ext& ext = GET_MUTABLE (rr, Ext);
                        const TRNAREF_EXT chs = ext.Which();
                        if (chs == NCBI_RNAEXT(Name)) {
                            string& str = GET_MUTABLE (ext, Name);
                            // a name that is only a type keyword sets the
                            // type and drops ext
                            if ( str.empty() || NStr::EqualNocase (str, "misc_RNA")) {
                                SET_FIELD( rr, Type, NCBI_RNAREF(miscRNA) );
                                RESET_FIELD(rr, Ext);
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            } else if (NStr::EqualNocase (str, "ncRNA")) {
                                SET_FIELD( rr, Type, NCBI_RNAREF(ncRNA) );
                                RESET_FIELD(rr, Ext);
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            } else if (NStr::EqualNocase (str, "tmRNA")) {
                                SET_FIELD( rr, Type, NCBI_RNAREF(tmRNA) );
                                RESET_FIELD(rr, Ext);
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            } else if (s_IsNcrnaName (str)) {
                                // a known ncRNA class name becomes gen.class;
                                // str is copied first because SetGen()
                                // replaces the name it refers to
                                SET_FIELD( rr, Type, NCBI_RNAREF(ncRNA) );
                                const string new_class = str;
                                SET_FIELD( rr.SetExt().SetGen(), Class, new_class );
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            } else {
                                // anything else is kept as the product of a
                                // miscRNA (same copy-before-SetGen caveat)
                                SET_FIELD( rr, Type, NCBI_RNAREF(miscRNA) );
                                const string new_product = str;
                                SET_FIELD( rr.SetExt().SetGen(), Product, new_product );
                                ChangeMade(CCleanupChange::eChangeRNAref);
                            }
                        }
                    } else {
                        // "other" without any ext simply becomes miscRNA
                        SET_FIELD( rr, Type, NCBI_RNAREF(miscRNA) );
                        ChangeMade(CCleanupChange::eChangeRNAref);
                    }
                }
                break;
            default:
                break;
        }
    }
}

// Appends a new qualifier (qual, val) to out_quals unless an entry with the
// same qualifier name and the same value is already present.
// eAddQualifier is reported only when something was appended.
void
CNewCleanup_imp::x_AddNonCopiedQual( 
    vector< CRef< CGb_qual > > &out_quals, const char *qual, const char *val )
{
    // nothing to do if an identical qual/val pair is already in the list
    for( const CRef< CGb_qual > &existing : out_quals ) {
        if( ! existing->IsSetQual() || ! (existing->GetQual() == qual) ) {
            continue;
        }
        if( existing->IsSetVal() && existing->GetVal() == val ) {
            return;
        }
    }

    out_quals.push_back( CRef< CGb_qual >( new CGb_qual(qual, val) ) );
    ChangeMade( CCleanupChange::eAddQualifier );
}

// Walks the feature's GenBank qualifiers.  For every qualifier that has both
// a name and a value, builds the text "<name>=<value>" (with '_' in the name
// replaced by '-'); if that text is recognized by s_StringHasOrgModPrefix or
// s_StringHasSubSourcePrefix, it is appended to org's Mod list and the
// qualifier is erased from the feature.
void CNewCleanup_imp::x_GBQualToOrgRef( COrg_ref &org, CSeq_feat &seqfeat )
{
    if( ! seqfeat.IsSetQual() ) {
        return;
    }

    EDIT_EACH_GBQUAL_ON_SEQFEAT( gbq_it, seqfeat ) {
        CGb_qual &gbq = **gbq_it;
        if( gbq.IsSetQual() && gbq.IsSetVal() ) {
            const string mod_name = NStr::Replace( gbq.GetQual(), "_", "-" );
            string mod_text = mod_name + "=" + gbq.GetVal();

            // out-parameters required by the prefix checkers; their values
            // are not used here
            size_t value_start;
            COrgMod::TSubtype orgmod_subtype;
            CSubSource::TSubtype subsrc_subtype;

            if( s_StringHasOrgModPrefix(mod_text, value_start, orgmod_subtype) ||
                s_StringHasSubSourcePrefix(mod_text, value_start, subsrc_subtype) )
            {
                // convert: record on the Org-ref, then drop the qualifier
                org.SetMod().push_back( mod_text );
                ERASE_GBQUAL_ON_SEQFEAT( gbq_it, seqfeat );
                ChangeMade(CCleanupChange::eAddOrgMod);
                ChangeMade(CCleanupChange::eRemoveQualifier);
            }
        }
    }
}

// If the descriptor is an Org, turns it into a Source descriptor whose
// BioSource carries that same Org-ref, and records eMoveDescriptor.
void CNewCleanup_imp::x_MoveSeqdescOrgToSourceOrg( CSeqdesc &seqdesc )
{
    if( ! seqdesc.IsOrg() ) {
        return;
    }

    // Take our own reference to the Org-ref before switching the descriptor
    // to Source (presumably the switch releases the descriptor's own
    // reference to it).
    CRef <COrg_ref> held_org ( &seqdesc.SetOrg() );
    seqdesc.SetSource().SetOrg( *held_org );
    ChangeMade (CCleanupChange::eMoveDescriptor);
}


// If the feature's data is an Org, turns the data into a Biosrc whose
// BioSource carries that same Org-ref, and records eConvertFeature.
void CNewCleanup_imp::x_MoveSeqfeatOrgToSourceOrg( CSeq_feat &seqfeat )
{
    const bool is_org_feat = seqfeat.IsSetData() && seqfeat.GetData().IsOrg();
    if( ! is_org_feat ) {
        return;
    }

    // Take our own reference to the Org-ref before switching the data
    // choice to Biosrc (presumably the switch releases the old reference).
    CRef <COrg_ref> held_org ( &seqfeat.SetData().SetOrg() );
    seqfeat.SetData().SetBiosrc().SetOrg( *held_org );
    ChangeMade (CCleanupChange::eConvertFeature);
}


// Part of ExtendedCleanup.
// For a nuc-prot set, visits every coding-region feature under the set's
// parent entry and asks feature::PromoteCDSToNucProtSet to promote it.
// A CDS is considered only if it has a product or its location is at least
// 6 long, and only if it is not flagged pseudo.  eMoveFeat is recorded for
// each successful promotion.
void CNewCleanup_imp::x_MoveCDSFromNucAnnotToSetAnnot( CBioseq_set &set )
{
    if (!set.IsSetClass() || set.GetClass() != CBioseq_set::eClass_nuc_prot) {
        return;
    }

    _ASSERT(set.GetParentEntry());
    CSeq_entry_Handle entry_handle = m_Scope->GetSeq_entryHandle(*(set.GetParentEntry()));
    SAnnotSelector cds_selector(CSeqFeatData::e_Cdregion);

    for (CFeat_CI cds_it(entry_handle, cds_selector); cds_it; ++cds_it) {
        const bool has_product_or_is_long_enough =
            cds_it->IsSetProduct() ||
            sequence::GetLength(cds_it->GetLocation(), m_Scope) >= 6;
        if (!has_product_or_is_long_enough) {
            continue;
        }

        const bool is_pseudo = cds_it->IsSetPseudo() && cds_it->GetPseudo();
        if (is_pseudo) {
            continue;
        }

        CSeq_feat_Handle cds_handle = cds_it->GetSeq_feat_Handle();
        if (feature::PromoteCDSToNucProtSet(cds_handle)) {
            ChangeMade(CCleanupChange::eMoveFeat);
        }
    }
}


// Applies CleanVisString to str in place and records eTrimSpaces when
// CleanVisString reports that it changed the string.
void CNewCleanup_imp::x_CleanupStringMarkChanged( std::string &str )
{
    const bool was_modified = CleanVisString (str);
    if (!was_modified) {
        return;
    }
    ChangeMade (CCleanupChange::eTrimSpaces);
}

// Applies CleanVisStringJunk to str in place and records eTrimSpaces when
// CleanVisStringJunk reports that it changed the string.
void CNewCleanup_imp::x_CleanupStringJunkMarkChanged( std::string &str )
{
    const bool was_modified = CleanVisStringJunk (str);
    if (!was_modified) {
        return;
    }
    ChangeMade (CCleanupChange::eTrimSpaces);
}



// Thin wrapper: compresses spaces in str in place via Asn2gnbkCompressSpaces
// and hands back that function's result.
bool CNewCleanup_imp::x_CompressSpaces( string &str )
{
    const bool result = Asn2gnbkCompressSpaces(str);
    return result;
}

void CNewCleanup_imp::x_CompressStringSpacesMarkChanged( std::string &str )
{
  const string::size_type old_length = str.length();

  x_CompressSpaces( str );

  const string::size_type new_length = str.length();
  if( old_length != new_length ) {
    ChangeMade (CCleanupChange::eCompressSpaces);
  }
}

// Applies CleanDoubleQuote to str in place and records eCleanDoubleQuotes
// when CleanDoubleQuote reports that it changed the string.
void CNewCleanup_imp::x_ConvertDoubleQuotesMarkChanged( std::string &str )
{
    const bool was_modified = CleanDoubleQuote(str);
    if( ! was_modified ) {
        return;
    }
    ChangeMade (CCleanupChange::eCleanDoubleQuotes);
}

// Makes sure seq_entry itself carries an NcbiCleanup user-object descriptor
// stamped with NCBI_CLEANUP_VERSION:
//   1. if seq_entry is a set, the object is removed from each member entry;
//   2. if seq_entry already has such a descriptor, it is updated in place;
//   3. otherwise a new descriptor is created and appended.
// eAddNcbiCleanupObject is recorded in cases 2 and 3.
void CNewCleanup_imp::x_AddNcbiCleanupObject( CSeq_entry &seq_entry )
{
    // step 1: strip the object from the directly contained entries
    if (seq_entry.IsSet() && seq_entry.GetSet().IsSetSeq_set()) {
        for (const auto& member_entry : seq_entry.GetSet().GetSeq_set()) {
            CCleanup::RemoveNcbiCleanupObject(*member_entry);
        }
    }

    // step 2: refresh an existing cleanup descriptor, if there is one
    if (seq_entry.IsSetDescr()) {
        auto& descriptors = seq_entry.SetDescr().Set();
        for (const auto& desc : descriptors) {
            const bool is_cleanup_object =
                desc->IsUser() &&
                desc->GetUser().GetObjectType() == CUser_object::eObjectType_Cleanup;
            if (!is_cleanup_object) {
                continue;
            }
            desc->SetUser().UpdateNcbiCleanup(NCBI_CLEANUP_VERSION);
            ChangeMade(CCleanupChange::eAddNcbiCleanupObject);
            return;
        }
    }

    // step 3: none found, so build a fresh descriptor and attach it
    CRef<CSeqdesc> new_desc( new CSeqdesc );
    CSeqdesc_Base::TUser& cleanup_user = new_desc->SetUser();
    cleanup_user.UpdateNcbiCleanup(NCBI_CLEANUP_VERSION);
    seq_entry.SetDescr().Set().push_back( new_desc );

    ChangeMade(CCleanupChange::eAddNcbiCleanupObject);
}

// Extracts the product part of a miRNA-style name:
//   "miRNA X" / "microRNA X"  -> "X"
//   "X miRNA" / "X microRNA"  -> "X", unless the name ends in
//                                "precursor miRNA" / "precursor microRNA"
// Returns kEmptyStr when the name matches none of these shapes.
static
string
s_GetMiRNAProduct( const string &name )
{
    // lengths of "miRNA " / " miRNA" and "microRNA " / " microRNA"
    const string::size_type kShortTagLen = 6;
    const string::size_type kLongTagLen  = 9;

    if ( NStr::StartsWith(name, "miRNA ") ) {
        return name.substr(kShortTagLen);
    }
    if ( NStr::StartsWith(name, "microRNA ") ) {
        return name.substr(kLongTagLen);
    }

    const bool short_suffix =
        NStr::EndsWith(name, " miRNA") &&
        ! NStr::EndsWith(name, "precursor miRNA");
    if ( short_suffix ) {
        return name.substr(0, name.length() - kShortTagLen);
    }

    const bool long_suffix =
        NStr::EndsWith( name, " microRNA") &&
        ! NStr::EndsWith(name, "precursor microRNA");
    if ( long_suffix ) {
        return name.substr(0, name.length() - kLongTagLen );
    }

    return kEmptyStr;
}

// For an RNA feature of type "other" whose ext is a plain name, reclassifies
// the RNA-ref according to that name:
//   - a recognized ncRNA class name -> type ncRNA with that class
//   - "ncRNA"                       -> type ncRNA, ext removed
//   - a miRNA-style name            -> type ncRNA, class "miRNA", product set
//   - "tmRNA"                       -> type tmRNA
//   - "misc_RNA"                    -> product name cleared
// Returns true if the RNA-ref was modified.
bool s_FixRNAOtherByName(CSeq_feat& feat)
{
    if (!feat.IsSetData() || !feat.GetData().IsRna()) {
        return false;
    }

    CRNA_ref& rna_ref = feat.SetData().SetRna();
    const bool is_named_other =
        rna_ref.IsSetType() &&
        rna_ref.GetType() == CRNA_ref::eType_other &&
        rna_ref.IsSetExt() &&
        rna_ref.GetExt().IsName();
    if (!is_named_other) {
        return false;
    }

    // work on a copy: the ext holding the name may be replaced below
    const string name = rna_ref.GetExt().GetName();

    if (s_IsNcrnaName(name)) {
        rna_ref.SetType(CRNA_ref::eType_ncRNA);
        rna_ref.SetExt().SetGen().SetClass(name);
        return true;
    }

    if (NStr::Equal(name, "ncRNA")) {
        rna_ref.ResetExt();
        rna_ref.SetType(CRNA_ref::eType_ncRNA);
        return true;
    }

    const string mirna_product = s_GetMiRNAProduct(name);
    if (!mirna_product.empty()) {
        rna_ref.SetType(CRNA_ref::eType_ncRNA);
        rna_ref.SetExt().SetGen().SetClass("miRNA");
        rna_ref.SetExt().SetGen().SetProduct(mirna_product);
        return true;
    }

    if (NStr::Equal(name, "tmRNA")) {
        rna_ref.SetType(CRNA_ref::eType_tmRNA);
        return true;
    }

    if (NStr::Equal(name, "misc_RNA")) {
        string unused_remainder;
        rna_ref.SetRnaProductName("", unused_remainder);
        return true;
    }

    return false;
}

// Modernizes ncRNA-related representations on an RNA feature.  All decisions
// are based on the RNA type as it was on entry:
//   - snRNA / scRNA / snoRNA become type ncRNA with the old type name as the
//     class; an ext name that differs from the type name is kept as product
//   - for ncRNA / other, every "ncRNA_class" qualifier is moved into the
//     RNA-ref's class (keeping the existing product name) and erased
//   - for ncRNA, class "antisense" becomes "antisense_RNA"
//   - for ncRNA, a product name of exactly "ncRNA" is cleared
// Returns true if anything was modified.
bool s_FixncRNA(CSeq_feat& feat)
{
    if (!feat.IsSetData() || !feat.GetData().IsRna()) {
        return false;
    }

    CRNA_ref& rna_ref = feat.SetData().SetRna();
    const CRNA_ref_Base::TType orig_type =
        (rna_ref.IsSetType() ? rna_ref.GetType() : NCBI_RNAREF(unknown));
    bool modified = false;

    const bool is_legacy_small_rna =
        orig_type == CRNA_ref::eType_snRNA ||
        orig_type == CRNA_ref::eType_scRNA ||
        orig_type == CRNA_ref::eType_snoRNA;
    if (is_legacy_small_rna) {
        const string class_name = CRNA_ref::GetRnaTypeName(orig_type);
        if (rna_ref.IsSetExt() && rna_ref.GetExt().IsName() &&
            !NStr::EqualNocase(rna_ref.GetExt().GetName(), class_name))
        {
            // copy first: switching the ext to Gen discards the name
            const string old_name = rna_ref.GetExt().GetName();
            rna_ref.SetExt().SetGen().SetProduct(old_name);
        }
        rna_ref.SetType(CRNA_ref::eType_ncRNA);
        rna_ref.SetExt().SetGen().SetClass(class_name);
        modified = true;
    }

    const bool was_ncrna_or_other =
        orig_type == CRNA_ref::eType_ncRNA ||
        orig_type == NCBI_RNAREF(other);
    if (feat.IsSetQual() && was_ncrna_or_other) {
        auto& quals = feat.SetQual();
        for (CSeq_feat::TQual::iterator it = quals.begin(); it != quals.end(); ) {
            string& qual_name = (*it)->SetQual();
            string& qual_value = (*it)->SetVal();
            if (qual_name != "ncRNA_class") {
                ++it;
                continue;
            }

            // remember the product, set the class, then restore the product
            const string saved_product = rna_ref.GetRnaProductName();
            rna_ref.SetType(CRNA_ref::eType_ncRNA);
            rna_ref.SetExt().SetGen().SetClass(qual_value);
            if (!NStr::IsBlank(saved_product)) {
                string unused_remainder;
                rna_ref.SetRnaProductName(saved_product, unused_remainder);
            }
            modified = true;
            it = quals.erase(it);
        }
        if (quals.empty()) {
            feat.ResetQual();
        }
    }

    const bool was_ncrna = (orig_type == NCBI_RNAREF(ncRNA));

    if (was_ncrna && rna_ref.IsSetExt() &&
        rna_ref.GetExt().IsGen() && rna_ref.GetExt().GetGen().IsSetClass() &&
        NStr::EqualNocase(rna_ref.GetExt().GetGen().GetClass(), "antisense"))
    {
        rna_ref.SetExt().SetGen().SetClass("antisense_RNA");
        modified = true;
    }

    if (was_ncrna) {
        const string current_product = rna_ref.GetRnaProductName();
        if (NStr::Equal(current_product, "ncRNA")) {
            string unused_remainder;
            rna_ref.SetRnaProductName("", unused_remainder);
            modified = true;
        }
    }

    return modified;
}

// Modernizes tmRNA-related representations on an RNA feature.  Decisions are
// based on the RNA type as it was on entry:
//   - for other / tmRNA / ncRNA, each "tag_peptide" qualifier is moved into
//     the RNA-ref's Gen quals (type "other" becomes tmRNA) and erased
//   - for tmRNA, each "ncRNA_class" qualifier is moved into the class and
//     erased
//   - if any qualifier was moved, the product name captured on entry is
//     re-applied
//   - for tmRNA, a product name of exactly "tmRNA" is cleared
// Returns true if anything was modified.
bool s_FixtmRNA(CSeq_feat& feat)
{
    if (!feat.IsSetData() || !feat.GetData().IsRna()) {
        return false;
    }

    CRNA_ref& rna_ref = feat.SetData().SetRna();
    bool modified = false;

    CRNA_ref::TType orig_type =
        (rna_ref.IsSetType() ? rna_ref.GetType() : CRNA_ref::eType_unknown);

    // captured up front so it can be restored after the ext is rebuilt
    string saved_product = rna_ref.GetRnaProductName();

    const bool type_is_eligible =
        orig_type == CRNA_ref::eType_other ||
        orig_type == CRNA_ref::eType_tmRNA ||
        orig_type == CRNA_ref::eType_ncRNA;
    if (feat.IsSetQual() && type_is_eligible) {
        auto& quals = feat.SetQual();
        for (CSeq_feat::TQual::iterator it = quals.begin(); it != quals.end(); ) {
            string& qual_name = (*it)->SetQual();
            string& qual_value = (*it)->SetVal();
            if (qual_name == "tag_peptide") {
                if (orig_type == CRNA_ref::eType_other) {
                    rna_ref.SetType(CRNA_ref::eType_tmRNA);
                }
                CRef<CRNA_qual> moved_qual(new CRNA_qual);
                moved_qual->SetQual(qual_name);
                moved_qual->SetVal(qual_value);
                rna_ref.SetExt().SetGen().SetQuals().Set().push_back(moved_qual);
                modified = true;
                it = quals.erase(it);
            } else if (qual_name == "ncRNA_class" && orig_type == CRNA_ref::eType_tmRNA) {
                rna_ref.SetExt().SetGen().SetClass(qual_value);
                modified = true;
                it = quals.erase(it);
            } else {
                ++it;
            }
        }
        if (quals.empty()) {
            feat.ResetQual();
        }
    }

    if (modified) {
        string unused_remainder;
        rna_ref.SetRnaProductName(saved_product, unused_remainder);
    }

    if (orig_type == NCBI_RNAREF(tmRNA)) {
        const string current_product = rna_ref.GetRnaProductName();
        if (NStr::Equal(current_product, "tmRNA")) {
            string unused_remainder;
            rna_ref.SetRnaProductName("", unused_remainder);
            modified = true;
        }
    }

    return modified;
}

// Modernizes an RNA feature of type "other" or miscRNA:
//   - type "other" is changed to miscRNA
//   - an ext name other than "ncRNA", "tmRNA" or "misc_RNA" is re-applied
//     through SetRnaProductName; any remainder it reports is appended to the
//     feature comment
//   - if the RNA-ref then has no product name, each "product" qualifier is
//     ITS-translated, stored as the Gen product, and erased from the feature
//     (when several are present, the last one processed is the one kept)
// If the qualifier list ends up empty it is reset, matching what s_FixncRNA
// and s_FixtmRNA do after they erase qualifiers.
// Returns true if anything was modified.
bool CNewCleanup_imp::x_FixMiscRNA(CSeq_feat& feat)
{
    if (!feat.IsSetData() || !feat.GetData().IsRna()) {
        return false;
    }

    bool any_change = false;
    CRNA_ref& rna = feat.SetData().SetRna();
    if (!rna.IsSetType() || (rna.GetType() != CRNA_ref::eType_other && rna.GetType() != CRNA_ref::eType_miscRNA)) {
        return false;
    }

    if (rna.GetType() == CRNA_ref::eType_other) {
        rna.SetType(CRNA_ref::eType_miscRNA);
        any_change = true;
    }

    if (rna.IsSetExt() && rna.GetExt().IsName()) {
        // copy: SetRnaProductName may replace the ext that owns the name
        string rna_name = rna.SetExt().SetName();
        if (rna_name != "ncRNA" &&
            rna_name != "tmRNA" &&
            rna_name != "misc_RNA")
        {
            string remainder;
            rna.SetRnaProductName(rna_name, remainder);
            if (!NStr::IsBlank(remainder)) {
                x_AddToComment(feat, remainder);
            }
            any_change = true;
        }
    }

    string product_name = rna.GetRnaProductName();
    if (NStr::IsBlank(product_name) && feat.IsSetQual())
    {
        auto& qual_list = feat.SetQual();
        CSeq_feat::TQual::iterator qual_iter = qual_list.begin();
        while (qual_iter != qual_list.end()) {
            string &qual = (*qual_iter)->SetQual();
            string &val = (*qual_iter)->SetVal();
            if (qual == "product") {
                // e.g. "its1" to "internal transcribed spacer 1"
                CNewCleanup_imp::TranslateITSName(val);
                rna.SetExt().SetGen().SetProduct(val);
                any_change = true;
                qual_iter = qual_list.erase(qual_iter);
            }
            else {
                ++qual_iter;
            }
        }
        // do not leave behind a set-but-empty qualifier list
        if (qual_list.empty()) {
            feat.ResetQual();
        }
    }
    return any_change;
}


// Runs the RNA modernization fixers on an RNA feature, in this order:
// s_FixRNAOtherByName, s_FixncRNA, s_FixtmRNA, x_FixMiscRNA.
// eChangeRNAref is recorded after each fixer that reports a modification.
void CNewCleanup_imp::x_ModernizeRNAFeat(CSeq_feat& feat)
{
    const bool is_rna_feat = feat.IsSetData() && feat.GetData().IsRna();
    if (!is_rna_feat) {
        return;
    }

    const bool fixed_by_name = s_FixRNAOtherByName(feat);
    if (fixed_by_name) {
        ChangeMade(CCleanupChange::eChangeRNAref);
    }

    const bool fixed_ncrna = s_FixncRNA(feat);
    if (fixed_ncrna) {
        ChangeMade(CCleanupChange::eChangeRNAref);
    }

    const bool fixed_tmrna = s_FixtmRNA(feat);
    if (fixed_tmrna) {
        ChangeMade(CCleanupChange::eChangeRNAref);
    }

    const bool fixed_miscrna = x_FixMiscRNA(feat);
    if (fixed_miscrna) {
        ChangeMade(CCleanupChange::eChangeRNAref);
    }
}

// A tRNA ext counts as empty when it has no amino acid, no anticodon, and
// either no codon list or an empty one.
static bool s_IsEmpty(const CTrna_ext& trna)
{
    return !trna.IsSetAa()
        && !(trna.IsSetCodon() && !trna.GetCodon().empty())
        && !trna.IsSetAnticodon();
}

// Basic cleanup of an RNA feature.
//   rna      - the RNA-ref being cleaned
//   seq_feat - the feature being cleaned; its comment, pseudo flag and
//              location are read and/or modified below
// The steps run in the order written; several later steps depend on the
// type/ext state produced by earlier ones.  Every modification is recorded
// through ChangeMade.  x_ModernizeRNAFeat is invoked as the final step.
void CNewCleanup_imp::RnaFeatBC (
    CRNA_ref& rna,
    CSeq_feat& seq_feat
)

{
    // an unset or "unknown" type is normalized to "other"
    if (!rna.IsSetType() || rna.GetType() == CRNA_ref::eType_unknown) {
        rna.SetType(CRNA_ref::eType_other);
        ChangeMade(CCleanupChange::eChangeRNAref);
    }
    // move rna.pseudo to feat.pseudo
    // (feat.pseudo is set to true whenever rna.pseudo is set, whatever its value)
    if ( FIELD_IS_SET(rna, Pseudo) ) {
        SET_FIELD(seq_feat, Pseudo, true);
        RESET_FIELD(rna, Pseudo);
        ChangeMade(CCleanupChange::eChangeQualifiers);
    }

    // tRNA ext: run the tRNA-specific cleanup, then make the anticodon
    // interval minus-strand when the feature location is minus-strand and
    // both are on the same Seq-id
    if ( rna.IsSetExt() &&
        rna.GetExt().IsTRNA() ) 
    {                
        CTrna_ext &tRNA = rna.SetExt().SetTRNA();
        x_SeqFeatTRNABC( seq_feat, tRNA );

        if( seq_feat.IsSetLocation() && 
            tRNA.IsSetAnticodon() &&
            tRNA.GetAnticodon().IsInt() ) 
        {
            const CSeq_id *loc_id = seq_feat.GetLocation().GetId();
            const CSeq_id *ac_id  = tRNA.GetAnticodon().GetId();
            if( loc_id && ac_id && loc_id->Compare( *ac_id ) == CSeq_id::e_YES ) {
                const ENa_strand loc_strand = seq_feat.GetLocation().GetStrand();
                const ENa_strand ac_strand = tRNA.GetAnticodon().GetStrand();
                if (loc_strand == eNa_strand_minus && ac_strand != eNa_strand_minus) {
                    tRNA.SetAnticodon().SetInt().SetStrand(eNa_strand_minus);
                    ChangeMade (CCleanupChange::eChangeAnticodon);
                }
            }
        }
    }

    // Add fMet-related comments
    // Applies to type tRNA with no ext (or a Gen ext) and a non-empty
    // comment: if the comment parses as a tRNA description, a tRNA ext with
    // the parsed amino acid replaces the ext.  When the comment was nothing
    // but tRNA text, it is removed, except that the fMet spellings are
    // normalized to "fMet" instead.
    if( (! FIELD_IS_SET(rna, Ext) || FIELD_IS(rna.GetExt(), Gen) ) &&
        FIELD_EQUALS( rna, Type, NCBI_RNAREF(tRNA) ) && 
        STRING_FIELD_NOT_EMPTY(seq_feat, Comment) )
    {
        bool justTrnaText = false;
        string codon;
        char aa = s_ParseSeqFeatTRnaString( GET_FIELD(seq_feat, Comment), 
            &justTrnaText, codon, true );
        if( aa != '\0' ) {
            CRef<CTrna_ext> tRNA( new CTrna_ext );
            tRNA->SetAa().SetNcbieaa( aa );
            rna.SetExt().SetTRNA( *tRNA );
            ChangeMade(CCleanupChange::eChange_tRna);
            if (justTrnaText) {
                if ( GET_FIELD(seq_feat, Comment) != "fMet" &&
                     GET_FIELD(seq_feat, Comment) != "fMet tRNA" &&
                     GET_FIELD(seq_feat, Comment) != "fMet-tRNA" ) {
                        RESET_FIELD( seq_feat, Comment );
                        ChangeMade(CCleanupChange::eRemoveComment);
                } else {
                    SET_FIELD( seq_feat, Comment, "fMet" );
                    ChangeMade(CCleanupChange::eChangeComment);
                }
            }
        }
    }

    // "S ribosomal RNA" logic
    // For an rRNA with no ext, a short comment ending in "S ribosomal RNA"
    // (16-19 chars) or "S rRNA" (7-15 chars) is moved into the ext name.
    // Note the second branch is only reached for lengths outside 16-19.
    if ( ! FIELD_IS_SET(rna, Ext) && 
        STRING_FIELD_NOT_EMPTY(seq_feat, Comment) &&
        FIELD_EQUALS( rna, Type, NCBI_RNAREF(rRNA) ) ) 
    {
        const size_t comment_len = GET_FIELD(seq_feat, Comment).length();
        if (comment_len > 15 && comment_len < 20) {
            if ( NStr::EndsWith(GET_FIELD(seq_feat, Comment), "S ribosomal RNA", NStr::eNocase) ) {
                rna.SetExt().SetName( GET_FIELD(seq_feat, Comment) );
                ChangeMade(CCleanupChange::eChangeRNAref);
                RESET_FIELD(seq_feat, Comment);
                ChangeMade(CCleanupChange::eRemoveComment);
            }
        } else if (comment_len > 6 && comment_len < 20) {
            if ( NStr::EndsWith(GET_FIELD(seq_feat, Comment), "S rRNA", NStr::eNocase) ) {
                rna.SetExt().SetName( GET_FIELD(seq_feat, Comment) );
                ChangeMade(CCleanupChange::eChangeRNAref);
                RESET_FIELD(seq_feat, Comment);
                ChangeMade(CCleanupChange::eRemoveComment);
            }
        }
    }

    // mRNA logic
    // For an mRNA with no ext, a comment ending in " RNA" or " mRNA"
    // (case-insensitive) is moved into the ext name.
    if( ! FIELD_IS_SET(rna, Ext) &&
        STRING_FIELD_NOT_EMPTY(seq_feat, Comment) &&
        FIELD_EQUALS( rna, Type, NCBI_RNAREF(mRNA) ) ) 
    {
        if ( NStr::EndsWith( GET_FIELD(seq_feat, Comment), " RNA",  NStr::eNocase ) ||
             NStr::EndsWith( GET_FIELD(seq_feat, Comment), " mRNA", NStr::eNocase ) )
        {
            rna.SetExt().SetName( GET_FIELD(seq_feat, Comment) );
            ChangeMade(CCleanupChange::eChangeRNAref);
            RESET_FIELD(seq_feat, Comment);
            ChangeMade(CCleanupChange::eRemoveComment);
        }
    }

    // ITS logic
    // For type other / miscRNA:
    //  - no ext: a comment that is (or can be translated to) an internal
    //    transcribed spacer name is moved into the ext name
    //  - ext is a name: the name is ITS-translated in place
    //  - ext is Gen with a product: the product is ITS-translated in place
    if( FIELD_EQUALS( rna, Type, NCBI_RNAREF(other)) || 
        FIELD_EQUALS( rna, Type, NCBI_RNAREF(miscRNA) ) ) 
    {
        
        if ( !rna.IsSetExt()) {
            if ( seq_feat.IsSetComment() && 
                 (IsInternalTranscribedSpacer(seq_feat.GetComment()) || 
                  TranslateITSName(seq_feat.SetComment()))) {
                rna.SetExt().SetName(seq_feat.GetComment());
                seq_feat.ResetComment();
                ChangeMade(CCleanupChange::eChangeITS);
            }
        } else {
            auto& ext = rna.SetExt();
            if (ext.IsName()) {
                // NOTE(review): eChangeITS is recorded even when the name was
                // already an ITS name and nothing was rewritten - confirm
                // this is intended
                if (IsInternalTranscribedSpacer(ext.GetName()) ||
                    TranslateITSName(ext.SetName())) {
                    ext.SetName(ext.GetName());
                    ChangeMade(CCleanupChange::eChangeITS);
                }
            }
            else if (ext.IsGen() && ext.GetGen().IsSetProduct()) {
                if (TranslateITSName(rna.SetExt().SetGen().SetProduct())) {
                    ChangeMade(CCleanupChange::eChangeITS);
                }
            }
        }
    }

    // if RNA is type "tRNA" and ext.tRNA is set, remove any feat.comments which
    // are redundant (e.g. comment is "aa: Alanine", when alanine is what the tRNA encodes)
    if( STRING_FIELD_NOT_EMPTY(seq_feat, Comment) && 
        FIELD_EQUALS( rna, Type, NCBI_RNAREF(tRNA) ) &&
        FIELD_IS_SET_AND_IS(rna, Ext, TRNA) && 
        FIELD_IS_SET(rna.GetExt().GetTRNA(), Aa) )
    {
        // extract the part of the comment we care about
        // (skip an optional leading "aa:" and any spaces that follow)
        string::size_type comment_start_pos = 0;
        if( NStr::StartsWith(GET_FIELD(seq_feat, Comment), "aa:") ) {
            comment_start_pos += 3; // 3 is len of "aa:"
        }
        comment_start_pos = GET_FIELD(seq_feat, Comment).find_first_not_of(" ", comment_start_pos);
        if( string::npos == comment_start_pos ) {
            // nothing but spaces remains: substr below yields an empty string
            comment_start_pos = GET_FIELD(seq_feat, Comment).length();
        }
        const string comment = GET_FIELD(seq_feat, Comment).substr(comment_start_pos);

        // convert to ncbieaa to standardize it
        const char aa = s_ConvertTrnaAaToLetter( rna.GetExt().GetTRNA().GetAa(), CSeqUtil::e_Ncbieaa );

        if( comment.length() == 1 && comment[0] == aa ) {
            RESET_FIELD(seq_feat, Comment);
            ChangeMade(CCleanupChange::eChangeComment);
        } else {
            // find the letter, 3-letter, and full name of the given aa (amino acid)
            CAminoAcidCharToSymbol::const_iterator aa_iter = sm_TrnaInverseKeys.lower_bound(aa);
            CAminoAcidCharToSymbol::const_iterator aa_end  = sm_TrnaInverseKeys.upper_bound(aa);
            for( ; aa_iter != aa_end ; ++aa_iter ) {
                const string &a_name = aa_iter->second;
                if (comment != a_name) continue;
                // the comment matches one of the names for this amino acid;
                // it is kept only for the special names fMet/iMet (M) and
                // Ile2 (I), otherwise it is redundant and removed
                if ( aa == 'M' ) {
                    if ( ! NStr::EqualNocase(a_name, "fMet") && ! NStr::EqualNocase(a_name, "iMet") ) {
                        RESET_FIELD(seq_feat, Comment);
                        ChangeMade(CCleanupChange::eChangeComment);
                        break;
                    }
                } else if ( aa == 'I' ) {
                    if ( ! NStr::EqualNocase(a_name, "Ile2") ) {
                        RESET_FIELD(seq_feat, Comment);
                        ChangeMade(CCleanupChange::eChangeComment);
                        break;
                    }
                } else {
                    RESET_FIELD(seq_feat, Comment);
                    ChangeMade(CCleanupChange::eChangeComment);
                    break;
                }
            }
        }
    }

    // remove feat.comment if redundant with parts of ext.gen
    // (i.e. equal to the class, the product, or the value of any Gen qual)
    if( STRING_FIELD_NOT_EMPTY(seq_feat, Comment) &&
        FIELD_IS_SET_AND_IS(rna, Ext, Gen) )
    {
        const string &comment = GET_FIELD(seq_feat, Comment);
        const CRNA_gen &gen = rna.GetExt().GetGen();
        if( FIELD_EQUALS(gen, Class, comment) || 
            FIELD_EQUALS(gen, Product, comment) ) 
        {
            RESET_FIELD(seq_feat, Comment);
            ChangeMade(CCleanupChange::eChangeComment);
        } else if (gen.IsSetQuals()) {
            const auto& genquals = gen.GetQuals().Get();
            for (auto qual_iter : genquals) {
                const CRNA_qual &rna_qual = *qual_iter;
                if( FIELD_EQUALS(rna_qual, Val, comment) ) {
                    RESET_FIELD(seq_feat, Comment);
                    ChangeMade(CCleanupChange::eChangeComment);
                    break;
                }
            }
        }
    }

    // if the type is mRNA, rRNA or tRNA and ext is a tRNA ext that is empty
    // (see s_IsEmpty), remove the ext
    if (rna.IsSetType() && 
        (rna.GetType() == CRNA_ref::eType_mRNA || rna.GetType() == CRNA_ref::eType_rRNA || rna.GetType() == CRNA_ref::eType_tRNA) &&
        rna.IsSetExt() && rna.GetExt().IsTRNA() &&
        s_IsEmpty(rna.GetExt().GetTRNA())) {
        rna.ResetExt();
        ChangeMade(CCleanupChange::eChangeRNAref);
    }

    // finally, convert legacy RNA representations to the modern form
    x_ModernizeRNAFeat(seq_feat);

}

// Ordering functor for code-breaks: orders them by the offset of their
// location from the start of the owning feature's location (as computed by
// sequence::LocationOffset).  A code-break with no location sorts before one
// that has a location.
//
// The functor keeps a reference to seq_feat_location, so that location must
// outlive the functor.
class CCodeBreakCompare
{
public:
    CCodeBreakCompare( const CSeq_loc &seq_feat_location, CRef<CScope> scope ) :
        m_Seq_feat_location(seq_feat_location), m_Scope(scope)
    {

    }

    // Arguments are taken by const reference so that each comparison does
    // not copy (and reference-count) the two CRefs.
    bool operator()( const CRef<CCode_break> &break1, const CRef<CCode_break> &break2 ) const
    {
        // check for missing locs (shouldn't happen, since locations are mandatory)
        const bool has_loc1 = FIELD_IS_SET(*break1, Loc);
        const bool has_loc2 = FIELD_IS_SET(*break2, Loc);
        if( ! has_loc1 || ! has_loc2 ) {
            // "missing" sorts before "present"; two missing locs are equivalent
            return (has_loc1 < has_loc2);
        }

        const CSeq_loc &loc1 = GET_FIELD(*break1, Loc);
        const CSeq_loc &loc2 = GET_FIELD(*break2, Loc);

        TSeqPos seq_pos1 =
            sequence::LocationOffset(m_Seq_feat_location, loc1,
            sequence::eOffset_FromStart,
            &*m_Scope);
        TSeqPos seq_pos2 =
            sequence::LocationOffset(m_Seq_feat_location, loc2,
            sequence::eOffset_FromStart,
            &*m_Scope);

        return ( seq_pos1 < seq_pos2 );
    }
private:
    const CSeq_loc &m_Seq_feat_location;
    mutable CRef<CScope> m_Scope;
};

// Equality functor for code-breaks: two code-breaks are equal when their
// locations compare as sequence::eSame and their amino acids are either both
// unset or both set and Equals().
class CCodeBreakEqual 
{
public:
    CCodeBreakEqual( CRef<CScope> scope ) : 
        m_Scope( scope ) { }

    // Arguments are taken by const reference so that each comparison does
    // not copy (and reference-count) the two CRefs.
    bool operator()( const CRef<CCode_break> &break1, const CRef<CCode_break> &break2 ) const
    {
        // check for missing locs (shouldn't happen, since locations are mandatory)
        // Only a mismatch is rejected here; when both are missing, control
        // falls through to the location comparison below.
        const bool has_loc1 = FIELD_IS_SET(*break1, Loc);
        const bool has_loc2 = FIELD_IS_SET(*break2, Loc); 
        if( has_loc1 != has_loc2 ) {
            return false;
        }

        const CSeq_loc &loc1 = GET_FIELD(*break1, Loc);
        const CSeq_loc &loc2 = GET_FIELD(*break2, Loc);

        if( sequence::eSame != sequence::Compare(loc1, loc2, &*m_Scope, sequence::fCompareOverlapping) ) {
            return false;
        }

        const bool aa_set1 = FIELD_IS_SET(*break1, Aa);
        const bool aa_set2 = FIELD_IS_SET(*break2, Aa);
        if( aa_set1 != aa_set2 ) {
            return false;
        }
        if( ! aa_set1 ) {
            // the flags are equal at this point, so neither has an amino acid
            return true;
        }

        return GET_FIELD(*break1, Aa).Equals( GET_FIELD(*break2, Aa) );
    }

private:
    mutable CRef<CScope> m_Scope;
};

// Basic cleanup of a coding-region feature.
//   cds     - the Cdregion data of the feature
//   seqfeat - the feature that owns cds
// Steps, in order: move the cdregion's xrefs to the protein, fix code-break
// strands, sort and de-duplicate code-breaks, and drop comments that are
// redundant with a code-break amino acid or with an EC number on the product
// protein.  Every modification is recorded through ChangeMade.
void CNewCleanup_imp::CdregionFeatBC (CCdregion& cds, CSeq_feat& seqfeat)
{
    // move the cdregion's xrefs to their destination protein
    x_MoveCdregionXrefsToProt( cds, seqfeat );

    // make code-break's location on minus strand if seq-feat's location is
    // on minus strand(and both are on the same seqid)
    // Only code-breaks whose location is a single interval are adjusted.
    if( FIELD_IS_SET(seqfeat, Location) ) {
        const ENa_strand seqfeat_loc_strand = GET_FIELD(seqfeat, Location).GetStrand();
        const CSeq_id* seqfeat_loc_id = GET_FIELD(seqfeat, Location).GetId();
        if( (seqfeat_loc_strand == eNa_strand_minus) && (seqfeat_loc_id != NULL) ) {
            EDIT_EACH_CODEBREAK_ON_CDREGION(code_break_iter, cds) {
                CCode_break &code_break = **code_break_iter;
                if( FIELD_IS_SET(code_break, Loc) ) {
                    const ENa_strand code_break_strand = GET_FIELD(code_break, Loc).GetStrand();
                    const CSeq_id* code_break_id = GET_FIELD(code_break, Loc).GetId();
                    if( (code_break_strand != eNa_strand_minus) && (code_break_id != NULL) && 
                        GET_FIELD(code_break, Loc).IsInt() &&
                        code_break_id->Compare(*seqfeat_loc_id) == CSeq_id::e_YES ) 
                    {
                        GET_MUTABLE(code_break, Loc).SetStrand(eNa_strand_minus);
                        ChangeMade( CCleanupChange::eChangeStrand );
                    }
                }
            }
        }
    }

    // sort/uniq code breaks
    // (ordering is by offset within the feature location, see
    // CCodeBreakCompare; the comparator holds a reference to that location)
    CCodeBreakCompare code_break_compare( seqfeat.GetLocation(), m_Scope );
    if( ! CODEBREAK_ON_CDREGION_IS_SORTED(cds, code_break_compare) ) {
        SORT_CODEBREAK_ON_CDREGION(cds, code_break_compare);
        ChangeMade(CCleanupChange::eChangeCodeBreak);
    }

    CCodeBreakEqual code_break_equal( m_Scope );
    if( ! CODEBREAK_ON_CDREGION_IS_UNIQUE(cds, code_break_equal) ) {
        UNIQUE_CODEBREAK_ON_CDREGION(cds, code_break_equal);
        ChangeMade(CCleanupChange::eChangeCodeBreak);
    }
    // a set-but-empty code-break list is removed altogether
    if (cds.IsSetCode_break() && cds.GetCode_break().empty()) {
        cds.ResetCode_break();
        ChangeMade(CCleanupChange::eChangeCodeBreak);
    }

    // check if comment is redundant due to selenocysteine or pyrrolysine
    if( GET_STRING_FLD_OR_BLANK(seqfeat, Comment) == "selenocysteine" || 
        GET_STRING_FLD_OR_BLANK(seqfeat, Comment) == "pyrrolysine" )
    {
        // NOTE(review): comment is a reference, not a copy; presumably it
        // aliases the feature's comment and so reads as empty once the
        // comment has been reset inside the loop below - confirm against the
        // GET_STRING_FLD_OR_BLANK macro before restructuring this loop.
        const string & comment = GET_STRING_FLD_OR_BLANK(seqfeat, Comment);
        FOR_EACH_CODEBREAK_ON_CDREGION(code_break_iter, cds) {
            const CCode_break &code_break = **code_break_iter;
            // We only check ncbieaa since that seems to be how the C
            // toolkit behaves.  Maybe in the future, we can also check for
            // ncbi8aa, ncbistdaa, etc.
            if( FIELD_IS_SET_AND_IS(code_break, Aa, Ncbieaa) ) {
                // 'U' pairs with "selenocysteine", 'O' with "pyrrolysine"
                if( GET_FIELD(code_break.GetAa(), Ncbieaa) == 'U' && 
                    comment == "selenocysteine" ) 
                {
                    RESET_FIELD(seqfeat, Comment);
                    ChangeMade(CCleanupChange::eChangeComment);
                } else if( GET_FIELD(code_break.GetAa(), Ncbieaa) == 'O' && 
                    comment == "pyrrolysine" ) 
                {
                    RESET_FIELD(seqfeat, Comment);
                    ChangeMade(CCleanupChange::eChangeComment);
                }
            }
        }
    }

    // check if comment redund with e.c. on product prot
    // (skipped entirely when there is no scope)
    if (m_Scope && x_IsCommentRedundantWithEC(seqfeat, *m_Scope)) {
        seqfeat.ResetComment();
        ChangeMade(CCleanupChange::eChangeComment);
    }
}


bool CNewCleanup_imp::x_IsCommentRedundantWithEC(const CSeq_feat& seqfeat, CScope& scope)
{
    if (!seqfeat.IsSetComment() || !seqfeat.IsSetProduct()) {
        return false;
    }
    CBioseq_Handle product_bioseq = scope.GetBioseqHandle(seqfeat.GetProduct());
    if (product_bioseq) {
        CConstRef<CBioseq> pseq = product_bioseq.GetCompleteBioseq();
        if (pseq && pseq->IsSetAnnot()) {
            for (auto ait : pseq->GetAnnot()) {
                if (ait->IsFtable()) {
                    for (auto fit : ait->GetData().GetFtable()) {
                        if (fit->IsSetData() &&
                            fit->GetData().GetSubtype() == CSeqFeatData::eSubtype_prot &&
                            fit->GetData().GetProt().IsSetEc()) {
                            for (auto ec : fit->GetData().GetProt().GetEc()) {
                                if (NStr::EqualNocase(ec, seqfeat.GetComment())) {
                                    return true;
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    return false;
}


bool CNewCleanup_imp::x_InGpsGenomic( const CSeq_feat& seqfeat )
{
    if( ! FIELD_IS_SET(seqfeat, Location) ) {
        return false;
    }
    const CSeq_id *loc_seq_id = GET_FIELD(seqfeat, Location).GetId();
    if( loc_seq_id == NULL ) {
        return false;
    }
    CBioseq_Handle bioseq_handle = m_Scope->GetBioseqHandle( *loc_seq_id );
    if( ! bioseq_handle ) {
        return false;
    }
    CBioseq_set_Handle parent_bioseq_set_handle = bioseq_handle.GetParentBioseq_set();
    for( ; parent_bioseq_set_handle; 
           parent_bioseq_set_handle = parent_bioseq_set_handle.GetParentBioseq_set() )
    {
        if( ! FIELD_IS_SET(parent_bioseq_set_handle, Class) ) {
            return false;
        }
        if( GET_FIELD(parent_bioseq_set_handle, Class) == CBioseq_set::eClass_nuc_prot ) {
            return false;
        } else if( GET_FIELD(parent_bioseq_set_handle, Class) == CBioseq_set::eClass_gen_prod_set ) {
            return true;
        } 
    }
    return false;
}

// Tells s_MoveNonDuplicatedItems whether it may rewrite its source container.
enum EMoveNonDuplicatedItemsOpt {
    eMoveNonDuplicatedItemsOpt_ModifySource = 1,
    eMoveNonDuplicatedItemsOpt_DoNotModifySource
};

// Appends to dest every item of src that is not already in dest, where
// "already in dest" is decided by the ordering less_than (two items are the
// same when neither is less than the other).  An item that occurs more than
// once in src is appended at most once.
//
// For example:
// Let's say that dest is {"abc", "123", "xyz"}
// and src is {"456", "123", "884", "abc"}, then afterwards they will be:
// src: {"123", "abc"} (holds the items we couldn't move over)
// dest: {"abc", "123", "xyz", "456", "884"}
// That's for eMoveNonDuplicatedItemsOpt_ModifySource; with
// eMoveNonDuplicatedItemsOpt_DoNotModifySource, dest is extended in exactly
// the same way but src is left untouched.
template< typename TDest, typename TSrc, typename TLessThan >
static
void s_MoveNonDuplicatedItems( TDest &dest, TSrc &src, 
    const TLessThan &less_than, 
    EMoveNonDuplicatedItemsOpt opt )
{
    // everything dest currently holds, for fast "is it there already?" checks
    std::set<typename TDest::value_type, TLessThan> dest_items_set(
        dest.begin(), dest.end(), less_than );

    const bool modify_source =
        ( opt == eMoveNonDuplicatedItemsOpt_ModifySource );

    // holds the items that we couldn't move over
    TSrc new_src;

    for( const auto &item : src ) {
        // insert() reports whether the item was new, so one lookup does the
        // job of both the membership test and the bookkeeping
        if( dest_items_set.insert( item ).second ) {
            dest.push_back( item );
        } else if( modify_source ) {
            new_src.push_back( item );
        }
    }

    // src keeps only the items that were not moved over.
    // Note that swap should be faster than assignment.
    if( modify_source ) {
        src.swap( new_src );
    }
}

// Merges the contents of a coding region's protein xref (cds_prot_ref) into
// the Prot-ref of the protein feature (prot_ref).
//   Db       - all entries are appended, then cleared from the xref
//   Name     - names not yet in prot_ref (case-insensitive) are appended
//   Desc     - taken over if prot_ref has none (and cleared from the xref);
//              otherwise appended after "; " when the two differ
//   Ec       - numbers not yet in prot_ref (case-insensitive) are appended
//   Activity - entries not yet in prot_ref (case-insensitive) are appended
// Name, Ec and Activity are left intact on the xref.
void s_CopyProtXrefToProtFeat( CProt_ref &prot_ref, CProt_ref &cds_prot_ref )
{
    const EMoveNonDuplicatedItemsOpt leave_xref_alone =
        eMoveNonDuplicatedItemsOpt_DoNotModifySource;

    // Db
    if( cds_prot_ref.IsSetDb() ) {
        const auto &xref_db = cds_prot_ref.GetDb();
        auto &feat_db = prot_ref.SetDb();
        feat_db.insert( feat_db.end(), xref_db.begin(), xref_db.end() );
        cds_prot_ref.ResetDb();
    }

    // Name
    if( cds_prot_ref.IsSetName() ) {
        s_MoveNonDuplicatedItems( prot_ref.SetName(), cds_prot_ref.SetName(),
            PNocase(), leave_xref_alone );
    }

    // Desc
    if( cds_prot_ref.IsSetDesc() ) {
        const string &xref_desc = cds_prot_ref.GetDesc();
        if( prot_ref.IsSetDesc() ) {
            if( prot_ref.GetDesc() != xref_desc ) {
                prot_ref.SetDesc( prot_ref.GetDesc() + "; " + xref_desc );
            }
        } else {
            prot_ref.SetDesc( xref_desc );
            cds_prot_ref.ResetDesc();
        }
    }

    // Ec
    if( cds_prot_ref.IsSetEc() ) {
        s_MoveNonDuplicatedItems( prot_ref.SetEc(), cds_prot_ref.SetEc(),
            PNocase(), leave_xref_alone );
    }

    // Activity
    if( cds_prot_ref.IsSetActivity() ) {
        s_MoveNonDuplicatedItems( prot_ref.SetActivity(), cds_prot_ref.SetActivity(),
            PNocase(), leave_xref_alone );
    }
}

// Folds protein xrefs found on a coding region feature into the protein
// feature(s) annotated on the coding region's product sequence, and removes
// those xrefs from the coding region.
//
// Nothing is done when the feature has no xrefs or no product, when
// x_InGpsGenomic() reports true for it, or when the product cannot be
// resolved in m_Scope.  The "cds" argument is not used.
void CNewCleanup_imp::x_MoveCdregionXrefsToProt (CCdregion& cds, CSeq_feat& seqfeat)
{
    if( !( seqfeat.IsSetXref() && seqfeat.IsSetProduct() ) ) {
        return;
    }
    if( x_InGpsGenomic(seqfeat) ) {
        return;
    }

    // locate the protein sequence the coding region points at
    CBioseq_Handle product_bioseq = m_Scope->GetBioseqHandle(seqfeat.GetProduct());
    if (!product_bioseq) {
        return;
    }
    CConstRef<CBioseq> pseq = product_bioseq.GetCompleteBioseq();
    if (!(pseq && pseq->IsSetAnnot())) {
        return;
    }

    for (auto annot : pseq->GetAnnot()) {
        if (!annot->IsFtable()) {
            continue;
        }
        for (auto prot_feat : annot->GetData().GetFtable()) {
            const bool is_prot_feat =
                prot_feat->IsSetData() &&
                prot_feat->GetData().GetSubtype() == CSeqFeatData::eSubtype_prot;
            if (!is_prot_feat) {
                continue;
            }

            auto& xrefs = seqfeat.SetXref();
            for (auto xref_it = xrefs.begin(); xref_it != xrefs.end(); ) {
                auto& xref = **xref_it;
                if (!(xref.IsSetData() && xref.GetData().IsProt())) {
                    ++xref_it;
                    continue;
                }

                // the feature table is reached through a const Bioseq, so
                // constness has to be cast away to edit the protein feature
                CRef<CSeq_feat> pfeat(const_cast<CSeq_feat *>(prot_feat.GetPointer()));

                // clean both Prot-refs before merging them
                ProtrefBC(pfeat->SetData().SetProt());
                ProtrefBC(xref.SetData().SetProt());
                s_CopyProtXrefToProtFeat(pfeat->SetData().SetProt(),
                    xref.SetData().SetProt());

                // the xref has been absorbed: drop it
                xref_it = xrefs.erase(xref_it);
                ChangeMade(CCleanupChange::eMoveToProtXref);
            }
        }
    }
}


// Basic cleanup of a delta extension: for delta-represented sequences,
// erases every literal segment that has sequence data, a length of zero
// and an Iupacna encoding.
void CNewCleanup_imp::DeltaExtBC( CDelta_ext & delta_ext, CSeq_inst &seq_inst )
{
    if( ! FIELD_EQUALS( seq_inst, Repr, CSeq_inst::eRepr_delta ) ) {
        return;
    }

    EDIT_EACH_DELTASEQ_IN_DELTAEXT( segment_iter, delta_ext ) {
        CDelta_seq &segment = **segment_iter;
        if( ! segment.IsLiteral() ) {
            continue;
        }

        const CSeq_literal &literal = segment.GetLiteral();
        const bool is_empty_iupacna =
            FIELD_IS_SET(literal, Seq_data) &&
            FIELD_EQUALS(literal, Length, 0) &&
            literal.GetSeq_data().IsIupacna();

        if( is_empty_iupacna ) {
            ERASE_DELTASEQ_IN_DELTAEXT( segment_iter, delta_ext );
            ChangeMade(CCleanupChange::eCleanDeltaExt);
        }
    }
}

// Basic cleanup of a user object: delegates to CCleanup::CleanupUserObject
// and records a change when that call reports one.
void CNewCleanup_imp::UserObjectBC( CUser_object &user_object )
{
    const bool was_modified = CCleanup::CleanupUserObject(user_object);
    if (!was_modified) {
        return;
    }
    ChangeMade(CCleanupChange::eCleanUserObjectOrField);
}

static int s_PcrPrimerCompare( 
    const CRef<CPCRPrimer> &p1, const CRef<CPCRPrimer> &p2 )
{
    if( p1.IsNull() || p2.IsNull() ) {
        return p2.IsNull() - p1.IsNull();
    }

    const string & name1 = ( p1->IsSetName() ? p1->GetName().Get() : kEmptyStr );
    const string & name2 = ( p2->IsSetName() ? p2->GetName().Get() : kEmptyStr );
    const int name_comparison = NStr::CompareCase(name1, name2);
    if( name_comparison != 0 ) {
        return name_comparison;
    }

    const string & seq1 = ( p1->IsSetSeq() ? p1->GetSeq().Get() : kEmptyStr );
    const string & seq2 = ( p2->IsSetSeq() ? p2->GetSeq().Get() : kEmptyStr );
    const int seq_comparison = NStr::CompareCase(seq1, seq2);
    return seq_comparison;
}

// Ordering functor for PCR primer references, built on s_PcrPrimerCompare.
class CPcrPrimerRefLessThan {
public:

    // true when p1 orders strictly before p2
    bool operator()(
        const CRef<CPCRPrimer> &p1, const CRef<CPCRPrimer> &p2 ) const
    {
        const int order = s_PcrPrimerCompare(p1, p2);
        return order < 0;
    }
};

// Equality functor for PCR primer references, built on s_PcrPrimerCompare.
class CPCRPrimerRefEqual {
public:
    // true when s_PcrPrimerCompare considers p1 and p2 equivalent
    bool operator()( 
        const CRef<CPCRPrimer> & p1, const CRef<CPCRPrimer> & p2 ) const
    {
        const int order = s_PcrPrimerCompare(p1, p2);
        return order == 0;
    }
};

// Basic cleanup of a set of PCR primers.
//
// For every primer:
//   1. the Seq string is cleaned (x_CleanupStringMarkChanged and
//      CPCRPrimerSeq::Clean) and reset if it ends up empty
//   2. the Name string is cleaned, its spaces compressed, and reset if it
//      ends up empty
//   3. if the primer has the same name (case-insensitive) as the previously
//      kept primer, the two are merged:
//        - current primer has no Seq: it carries nothing new, so its Name is
//          reset and it gets erased in step 4
//        - previous primer has no Seq but the current one does: the Seq is
//          handed to the previous primer and the current one is emptied
//        - both have a Seq: both are kept as they are
//   4. a primer left with neither Name nor Seq is erased
// Finally duplicate primers are removed and the set is dropped if empty.
void CNewCleanup_imp::x_PCRPrimerSetBC( CPCRPrimerSet &primer_set )
{
    // most recent primer that survived step 4
    CPCRPrimer *last = NULL;

    EDIT_EACH_PCRPRIMER_IN_PCRPRIMERSET( primer_iter, primer_set ) {
        CPCRPrimer &primer = **primer_iter;
        
        if( FIELD_IS_SET(primer, Seq) ) {
            string &seq = GET_MUTABLE(primer, Seq).Set();
            const string before = seq;
            x_CleanupStringMarkChanged(seq);
            CPCRPrimerSeq::Clean(seq);
            if( before != seq ) {
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
            if( seq.empty() ) {
                RESET_FIELD(primer, Seq);
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
        }

        if( FIELD_IS_SET(primer, Name) ) {
            string &name = GET_MUTABLE(primer, Name).Set();
            const string before = name;
            x_CleanupStringMarkChanged(name);
            x_CompressStringSpacesMarkChanged(name);
            if( before != name ) {
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
            if( name.empty() ) {
                RESET_FIELD(primer, Name);
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
        }

        if (last != NULL) {
            CPCRPrimer &lastprimer = *last;
            if (FIELD_IS_SET(lastprimer, Name) && FIELD_IS_SET(primer, Name)) {
                string &name1 = GET_MUTABLE(lastprimer, Name).Set();
                string &name2 = GET_MUTABLE(primer, Name).Set();
                if (NStr::EqualNocase(name1, name2)) {
                    // The "current primer has no Seq" case is tested first so
                    // that the Seq of the current primer is only ever read
                    // when it is actually set.
                    if (! FIELD_IS_SET(primer, Seq)) {
                        RESET_FIELD(primer, Name);
                        ChangeMade(CCleanupChange::eChangePCRPrimers);
                    } else if (! FIELD_IS_SET(lastprimer, Seq)) {
                        SET_FIELD(lastprimer, Seq, GET_FIELD(primer, Seq) );
                        RESET_FIELD(primer, Name);
                        RESET_FIELD(primer, Seq);
                        ChangeMade(CCleanupChange::eChangePCRPrimers);
                    }
                }
            }
        }

        if( ! FIELD_IS_SET(primer, Name) && ! FIELD_IS_SET(primer, Seq) ) {
            ERASE_PCRPRIMER_IN_PCRPRIMERSET(primer_iter, primer_set);
            ChangeMade(CCleanupChange::eChangePCRPrimers);
        } else {
            last = *primer_iter;
        }
    }

    UNIQUE_WITHOUT_SORT_PCRPRIMER_IN_PCRPRIMERSET( primer_set, CPcrPrimerRefLessThan );

    REMOVE_IF_EMPTY_PCRPRIMER_IN_PCRPRIMERSET( primer_set );
}

// If the entry's descriptors hold a GenBank block with a non-empty division
// and an OrgName (from an Org or a Source descriptor) whose division is
// empty or unset, copies the GenBank division into the OrgName.
// When several candidate descriptors exist, the last one of each kind wins.
void CNewCleanup_imp::x_CopyGBBlockDivToOrgnameDiv( CSeq_entry &seq_entry)
{
    if (!seq_entry.IsSetDescr()) {
        return;
    }

    COrgName *target_orgname = nullptr;
    CGB_block *source_block = nullptr;

    for (auto desc_ref : seq_entry.SetDescr().Set()) {
        CSeqdesc &desc = *desc_ref;

        switch( desc.Which() ) {
        case CSeqdesc::e_Genbank:
            source_block = &desc.SetGenbank();
            break;
        case CSeqdesc::e_Org:
            if( desc.GetOrg().IsSetOrgname() ) {
                target_orgname = &desc.SetOrg().SetOrgname();
            }
            break;
        case CSeqdesc::e_Source:
            if( desc.GetSource().IsSetOrg() &&
                desc.GetSource().GetOrg().IsSetOrgname() )
            {
                target_orgname = &desc.SetSource().SetOrg().SetOrgname();
            }
            break;
        default:
            break;
        }
    }

    if( ! target_orgname || ! source_block ) {
        return;
    }
    // only fill a division that is missing, and only from one that is present
    if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(*target_orgname, Div) ) {
        return;
    }
    if( RAW_FIELD_IS_EMPTY_OR_UNSET(*source_block, Div) ) {
        return;
    }

    target_orgname->SetDiv( source_block->GetDiv() );
    ChangeMade(CCleanupChange::eChangeQualifiers);
}

// Cleans an author list through CCleanup::CleanupAuthList, passing true as
// its second argument (presumably the "fix initials" switch, going by this
// method's name).
void CNewCleanup_imp::x_AuthListBCWithFixInitials( CAuth_list& al )
{
    const bool second_arg_enabled = true;
    CCleanup::CleanupAuthList( al, second_arg_enabled );
}

void CNewCleanup_imp::x_PostProcessing(void)
{
    // convert muid to pmid, where possible
    if( ! m_MuidPubContainer.empty() ) {
        NON_CONST_ITERATE( TMuidPubContainer, pub_iter, m_MuidPubContainer ) {
            CPub &pub = **pub_iter;
            const TEntrezId muid = pub.GetMuid();
            
            // attempt to find that muid in the muid-to-pmid mapping created earlier
            TMuidToPmidMap::const_iterator map_iter = m_MuidToPmidMap.find(ENTREZ_ID_TO(int, muid));
            if( map_iter != m_MuidToPmidMap.end() ) {
                const TEntrezId pmid = ENTREZ_ID_FROM(int, map_iter->second);
                pub.SetPmid().Set(pmid);
                ChangeMade(CCleanupChange::eChangePublication);
            }
        }

        m_MuidPubContainer.clear();
    }

    // update cit-gens that pointed to obsolete pubs

    if( ! m_OldLabelToPubMap.empty() && ! m_PubToNewPubLabelMap.empty() &&
        ! m_SeqFeatCitPubContainer.empty() ) 
    {
        NON_CONST_ITERATE( TSeqFeatCitPubContainer, pub_iter, m_SeqFeatCitPubContainer ) {
            CPub &pub = **pub_iter;

            if( FIELD_IS(pub, Gen) && FIELD_IS_SET(pub.GetGen(), Cit) ) {
                CCit_gen &gen = pub.SetGen();
                const string &cit = gen.GetCit();

                TOldLabelToPubMap::const_iterator iter   = m_OldLabelToPubMap.lower_bound(cit);
                TOldLabelToPubMap::const_iterator finish = m_OldLabelToPubMap.upper_bound(cit);
                for( ; iter != finish; ++iter ) {
                    CRef<CPub> referenced_pub = iter->second;
                    const string &new_label = m_PubToNewPubLabelMap[referenced_pub];
                    if( ! new_label.empty() && cit != new_label ) {
                        gen.SetCit( new_label );
                        ChangeMade(CCleanupChange::eCleanCitonFeat);
                        break;
                    }
                }
            }
        }
    }

    // sometimes Seq-feat.cit.pub.gen items are cut off, so we try to fill them out
    if( ! m_PubdescCitGenLabelVec.empty() && ! m_SeqFeatCitPubContainer.empty() ) {
        NON_CONST_ITERATE( TSeqFeatCitPubContainer, pub_iter, m_SeqFeatCitPubContainer ) {
            CPub &pub = **pub_iter;

            if( pub.IsGen() ) {
                CCit_gen &gen = GET_MUTABLE(pub, Gen);
                if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(gen, Cit) ) {
                    const string &cit = GET_FIELD(gen, Cit);
                    if( (cit.length() > 1) && NStr::EndsWith(cit, ">") ) {
                        string cit_copy = cit;
                        cit_copy.resize( cit_copy.length() - 1 ); // chop off final ">"
                        s_RegexpReplace( cit_copy, "Unpublished[ ]+", "Unpublished", 1 );

                        // check if the cit is a strict prefix of any of the cit-gen labels from before
                        ITERATE( TPubdescCitGenLabelVec, label_iter, m_PubdescCitGenLabelVec) {
                            const string &label = *label_iter;
                            if( (label.length() > cit_copy.length()) && NStr::StartsWith(label, cit_copy) ) {
                                gen.SetCit( label );
                                ChangeMade(CCleanupChange::eCleanCitonFeat);
                                break;
                            }
                        }
                    }
                }
            }
        }
    }
}


// Resets the descriptor set of a Bioseq-set when it is set but holds no
// descriptors.
void CNewCleanup_imp::x_ClearEmptyDescr( CBioseq_set& bioseq_set )
{
    if (!bioseq_set.IsSetDescr()) {
        return;
    }
    const bool has_no_descriptors = bioseq_set.GetDescr().Get().empty();
    if (has_no_descriptors) {
        bioseq_set.ResetDescr();
    }
}


// Resets the descriptor set of a Bioseq when it is set but holds no
// descriptors.
void CNewCleanup_imp::x_ClearEmptyDescr( CBioseq& bioseq )
{
    if (!bioseq.IsSetDescr()) {
        return;
    }
    const bool has_no_descriptors = bioseq.GetDescr().Get().empty();
    if (has_no_descriptors) {
        bioseq.ResetDescr();
    }
}

// Tells whether the strand value of a Seq-inst should be removed.
//   - strand "not set" (explicitly stored): always bad
//   - single-stranded DNA: bad only when a BioSource with a lineage is
//     supplied, the lineage does not contain "virus" (case-insensitive) and
//     the division is not "SYN" (case-insensitive)
//   - everything else, including an unset strand: not bad
// bio_src may be null.
static bool IsBadSeqInstStrand(const CSeq_inst& inst, const CBioSource* bio_src)
{
    if (!inst.IsSetStrand()) {
        return false;
    }

    const auto strand = inst.GetStrand();
    if (strand == CSeq_inst::eStrand_not_set) {
        return true;
    }

    const bool is_single_stranded_dna =
        strand == CSeq_inst::eStrand_ss &&
        inst.IsSetMol() && inst.GetMol() == CSeq_inst::eMol_dna;
    if (!is_single_stranded_dna) {
        return false;
    }

    // without lineage information nothing can be concluded
    if (!bio_src || !bio_src->IsSetLineage()) {
        return false;
    }
    if (NStr::FindNoCase(bio_src->GetLineage(), "virus", 0) != NPOS) {
        return false;
    }
    if (bio_src->IsSetDivision() && NStr::EqualNocase(bio_src->GetDivision(), "SYN")) {
        return false;
    }
    return true;
}

// Resets the strand of a Bioseq's Seq-inst when IsBadSeqInstStrand() says
// so, using the BioSource that applies to the sequence.
// Sequences carrying a patent id are never touched.
void CNewCleanup_imp::x_RemoveSingleStrand(CBioseq& bioseq)
{
    // single-strandedness is kept for patent sequences
    for (auto seq_id : bioseq.GetId()) {
        if (seq_id->IsPatent()) {
            return;
        }
    }

    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(bioseq);

    const bool strand_is_bad =
        bioseq.IsSetInst() &&
        IsBadSeqInstStrand(bioseq.GetInst(), sequence::GetBioSource(bsh));
    if (!strand_is_bad) {
        return;
    }

    bioseq.SetInst().ResetStrand();
    ChangeMade(CCleanupChange::eChangeBioseqInst);
}

// Scans a pub-equiv (recursively) for a muid and a pmid.  When both are
// found and positive they are assumed to name the same publication, and
// the pair is recorded in m_MuidToPmidMap.
void CNewCleanup_imp::x_NotePubdescOrAnnotPubs( 
    const CPub_equiv &pub_equiv )
{
    int found_muid = 0;
    int found_pmid = 0;
    x_NotePubdescOrAnnotPubs_RecursionHelper( pub_equiv, found_muid, found_pmid );

    const bool have_both = (found_muid > 0) && (found_pmid > 0);
    if( ! have_both ) {
        return;
    }
    m_MuidToPmidMap[found_muid] = found_pmid;
}

void CNewCleanup_imp::x_NotePubdescOrAnnotPubs_RecursionHelper(
    const CPub_equiv &pub_equiv, int &muid, int &pmid )
{
    FOR_EACH_PUB_ON_PUBEQUIV(pub_iter, pub_equiv) {
        const CPub &pub = **pub_iter;
        switch( pub.Which() ) {
        case NCBI_PUB(Muid):
            muid = ENTREZ_ID_TO(int, pub.GetMuid());
            break;
        case NCBI_PUB(Pmid):
            pmid = ENTREZ_ID_TO(int, pub.GetPmid().Get());
            break;
        case NCBI_PUB(Gen): 
            {
                const CCit_gen &gen = pub.GetGen();
                if( gen.IsSetCit() || gen.IsSetJournal() || gen.IsSetDate() || gen.IsSetSerial_number() ) {
                    m_PubdescCitGenLabelVec.push_back( kEmptyStr );
                    string &label = m_PubdescCitGenLabelVec.back();
                    pub.GetLabel( &label, CPub::eContent, true );
                }
            }
            break;
        case NCBI_PUB(Equiv):
            x_NotePubdescOrAnnotPubs_RecursionHelper( pub.GetEquiv(), muid, pmid );
            break;
        default:
            break;
        }
    }
}

// Records the pub under its current content label in m_OldLabelToPubMap.
void CNewCleanup_imp::x_RememberPubOldLabel( CPub &pub )
{
    string label_before_cleanup;
    pub.GetLabel( &label_before_cleanup, CPub::eContent, true);

    const CRef<CPub> pub_ref(&pub);
    m_OldLabelToPubMap.insert(
        TOldLabelToPubMap::value_type(label_before_cleanup, pub_ref) );
}

void CNewCleanup_imp::x_RememberMuidThatMightBeConvertibleToPmid( int &muid, CPub &pub )
{
    // ignore the "muid" arg; it's just so we only add muid pubs to the container

    m_MuidPubContainer.push_back( CRef<CPub>(&pub) );
}

// Records a Seq-feat citation pub in m_SeqFeatCitPubContainer.
// An equiv pub is not stored itself; its members are recorded recursively.
void CNewCleanup_imp::x_RememberSeqFeatCitPubs( CPub &pub )
{
    if( ! pub.IsEquiv() ) {
        m_SeqFeatCitPubContainer.push_back( CRef<CPub>(&pub) );
        return;
    }

    // descend into the members of the equiv
    EDIT_EACH_PUB_ON_PUBEQUIV( member_iter, GET_MUTABLE(pub, Equiv) ) {
        x_RememberSeqFeatCitPubs( **member_iter );
    }
}

// Runs CCleanup::DecodeXMLMarkChanged on the string and records a change
// when that call reports one.
void CNewCleanup_imp::x_DecodeXMLMarkChanged( std::string & str )
{
    const bool was_decoded = CCleanup::DecodeXMLMarkChanged(str);
    if (!was_decoded) {
        return;
    }
    ChangeMade(CCleanupChange::eDecodeXML);
}


// Puts the entry's descriptors into normalized order via
// CCleanup::NormalizeDescriptorOrder and records a change when that call
// reports one.  Entries without descriptors are left alone.
void CNewCleanup_imp::x_SortSeqDescs( CSeq_entry & seq_entry )
{
    if (!seq_entry.IsSetDescr()) {
        return;
    }
    if (CCleanup::NormalizeDescriptorOrder(seq_entry.SetDescr())) {
        ChangeMade( CCleanupChange::eMoveDescriptor );
    }
}



// Bioseq overload of x_RemoveDupBioSource: intentionally does nothing.
// The argument is not used.
void CNewCleanup_imp::x_RemoveDupBioSource( CBioseq & bioseq )
{
}

// For every BioSource descriptor on the set, asks each member entry (and,
// through the Seq-entry overload, its descendants) to drop descriptors that
// duplicate it.  Sets without descriptors or without members are skipped.
void CNewCleanup_imp::x_RemoveDupBioSource( CBioseq_set & bioseq_set )
{
    if (!(bioseq_set.IsSetDescr() && bioseq_set.IsSetSeq_set())) {
        return;
    }

    for (auto set_desc : bioseq_set.GetDescr().Get()) {
        if (!set_desc->IsSource()) {
            continue;
        }
        for (auto member : bioseq_set.SetSeq_set()) {
            x_RemoveDupBioSource(*member, set_desc->GetSource());
        }
    }
}


// Predicate that recognizes descriptors holding a BioSource equal to m_Src.
struct SMatchSrc {
    // the BioSource to compare against (not owned)
    const CBioSource& m_Src;

    // true when desc is a Source descriptor whose BioSource Equals() m_Src
    bool operator()(CRef<CSeqdesc> desc) {
        if (!desc->IsSource()) {
            return false;
        }
        return desc->GetSource().Equals(m_Src);
    }
};
// Removes from the entry, and recursively from every entry nested in it,
// all Source descriptors whose BioSource equals "src".
// A change is recorded for each entry that actually lost descriptors.
void CNewCleanup_imp::x_RemoveDupBioSource(CSeq_entry& se, const CBioSource& src)
{
    if (se.IsSetDescr()) {
        SMatchSrc matcher{ src };
        // Must be a reference: taking the container by value would erase
        // from a temporary copy and leave the entry's descriptors untouched
        // while still reporting a change.
        auto& dset = se.SetDescr().Set();
        const auto before = dset.size();
        dset.erase(std::remove_if(dset.begin(), dset.end(), matcher), dset.end());
        if (dset.size() != before) {
            ChangeMade(CCleanupChange::eRemoveDupBioSource);
        }
    }
    if (se.IsSet() && se.GetSet().IsSetSeq_set()) {
        for (auto sub : se.SetSet().SetSeq_set()) {
            x_RemoveDupBioSource(*sub, src);
        }
    }
}



// Removes duplicate publications from the descriptor set through
// CCleanup::RemoveDuplicatePubs and records a change when that call
// reports one.
void CNewCleanup_imp::x_RemoveDupPubs(CSeq_descr & descr)
{
    const bool removed_any = CCleanup::RemoveDuplicatePubs(descr);
    if (!removed_any) {
        return;
    }
    ChangeMade(CCleanupChange::eRemoveDescriptor);
}

// Synchronizes the GenBank-block keywords of a Bioseq with its structured
// comments.
//   1. Every keyword that matches (case-insensitively) one of the controlled
//      keywords from CComment_rule::GetKeywordList() is removed; GenBank
//      blocks that become empty are removed as well.
//   2. For each structured-comment user object that validates without
//      errors against its comment rule, the keyword belonging to its prefix
//      is collected.
//   3. The collected keywords are added to a GenBank block (the last one
//      found, or a newly created one).
// A keyword change is recorded when the keyword list seen before step 1
// differs from the list assembled in step 3.
void CNewCleanup_imp::x_FixStructuredCommentKeywords( CBioseq & bioseq )
{
    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(bioseq);
    CBioseq_EditHandle beh(bsh);
    vector<string> controlled_keywords = CComment_rule::GetKeywordList();
    // every keyword encountered before any removal, in traversal order
    vector<string> original_keywords;

    auto& dset = beh.SetDescr().Set();
    if (!dset.empty()) {
        CBioseq::TDescr::Tdata::iterator it = dset.begin();
        while (it != dset.end()) {
            CSeqdesc& desc = **it;
            if (desc.Which() != CSeqdesc::e_Genbank) {
                ++it;
                continue;
            }
            CGB_block& gb_block = desc.SetGenbank();
            EDIT_EACH_KEYWORD_ON_GENBANKBLOCK(k_itr, gb_block) {
                original_keywords.push_back(*k_itr);
                // strip controlled keywords; they are re-derived below
                FOR_EACH_STRING_IN_VECTOR(s_itr, controlled_keywords) {
                    if (NStr::EqualNocase(*k_itr, *s_itr)) {
                        ERASE_KEYWORD_ON_GENBANKBLOCK(k_itr, gb_block);
                        break;
                    }
                }
            }
            if (gb_block.IsSetKeywords() && gb_block.GetKeywords().size() == 0) {
                gb_block.ResetKeywords();
            }
            // drop GenBank blocks that have nothing left in them
            if (gb_block.IsEmpty()) {
                it = dset.erase(it);
            } else {
                ++it;
            }
        }
    }

    // keywords implied by the valid structured comments on this sequence
    vector<string> new_keywords;

    for (CSeqdesc_CI di(bsh, CSeqdesc::e_User); di; ++di) {
        const CUser_object& usr = di->GetUser();
        if ( ! CComment_rule::IsStructuredComment (usr) ) continue;
        string prefix = CComment_rule::GetStructuredCommentPrefix (usr);
        if (!prefix.empty()) {
            CConstRef<CComment_set> comment_rules = CComment_set::GetCommentRules();
            if (comment_rules) {
                CConstRef<CComment_rule> ruler = comment_rules->FindCommentRuleEx(prefix);
                if (ruler) {
                    const CComment_rule& rule = *ruler;
                    CComment_rule::TErrorList errors = rule.IsValid(usr);
                    // only comments that pass validation contribute a keyword
                    if (errors.size() == 0) {
                        string kywd = CComment_rule::KeywordForPrefix( prefix );
                        if (! kywd.empty()) {
                            new_keywords.push_back(kywd);
                        }
                    }
                }
            }
        }
    }
    // keyword list after the update; only filled when there is something to add
    vector<string> final_keywords;
    if (new_keywords.size() > 0) {
        // pick the last GenBank block among the descriptors, if any
        CGB_block *gb_block = NULL;
        for (auto itr : dset) {
            CSeqdesc& desc = *itr;
            if ( desc.Which() != CSeqdesc::e_Genbank ) continue;
            gb_block = &desc.SetGenbank();
        }
        if (! gb_block) {
            CRef<CSeqdesc> new_desc ( new CSeqdesc );
            gb_block = &(new_desc->SetGenbank());
            bioseq.SetDescr().Set().push_back( new_desc );
        }
        if (gb_block->IsSetKeywords()) {
            FOR_EACH_KEYWORD_ON_GENBANKBLOCK (k_itr, *gb_block) {
                final_keywords.push_back(*k_itr);
            }
        }
        FOR_EACH_STRING_IN_VECTOR ( n_itr, new_keywords ) {
            ADD_KEYWORD_TO_GENBANKBLOCK (*gb_block, *n_itr);
            final_keywords.push_back(*n_itr);
        }
    }
    // element-wise comparison of the keyword list before and after
    // NOTE(review): final_keywords stays empty when there are no new
    // keywords, so any pre-existing keyword (even an uncontrolled one that
    // was kept) makes the lists differ and a change is reported - confirm
    // that this is intended.
    bool any_change = false;
    vector<string>::iterator orig_k = original_keywords.begin();
    vector<string>::iterator final_k = final_keywords.begin();
    while (!any_change && orig_k != original_keywords.end() && final_k != final_keywords.end()) {
        if (!NStr::Equal(*orig_k, *final_k)) {
            any_change = true;
        }
        ++orig_k;
        ++final_k;
    }
    // a length mismatch is a change as well
    if (orig_k != original_keywords.end() || final_k != final_keywords.end()) {
        any_change = true;
    }
    if (any_change) {
        ChangeMade(CCleanupChange::eChangeKeywords);
    }
    // do not leave an empty descriptor set behind
    if (dset.empty()) {
        beh.ResetDescr();
    }
}

// Resets the description of a Prot-ref when it repeats (case-insensitively)
// one of the Prot-ref's own names.
void CNewCleanup_imp::x_RemoveProtDescThatDupsProtName( CProt_ref & prot )
{
    if (!prot.IsSetDesc()) {
        return;
    }

    const CProt_ref::TDesc& description = prot.GetDesc();
    FOR_EACH_NAME_ON_PROTREF (name_iter, prot) {
        if (!NStr::EqualNocase(description, *name_iter)) {
            continue;
        }
        prot.ResetDesc();
        ChangeMade(CCleanupChange::eChangeQualifiers);
        // "description" refers to the field that was just reset: stop here
        break;
    }
}

// Removes redundancy between a gene feature's comment and its Gene-ref.
//   - comment matches the gene description: the description is reset if the
//     Gene-ref carries any other field (locus, allele, maploc, locus-tag,
//     db, syn); otherwise the comment is reset instead
//   - comment matches the gene locus: the comment is reset
void CNewCleanup_imp::x_RemoveRedundantComment( CGene_ref& gene, CSeq_feat & seq_feat )
{
    if( ! seq_feat.IsSetComment() ) {
        return;
    }

    // kept as a reference on purpose: it follows the field if it is reset
    const string & comment_text = seq_feat.GetComment();

    if ( STRING_FIELD_MATCH (gene, Desc, comment_text)) { 
        const bool gene_has_other_fields =
            gene.IsSetLocus() ||
            gene.IsSetAllele() ||
            gene.IsSetMaploc() ||
            gene.IsSetLocus_tag() ||
            gene.IsSetDb() ||
            gene.IsSetSyn();

        if (gene_has_other_fields) {
            gene.ResetDesc();
            ChangeMade(CCleanupChange::eChangeQualifiers);
        } else {
            // the description is all the Gene-ref has: drop the comment
            seq_feat.ResetComment();
            ChangeMade(CCleanupChange::eChangeComment);
        }
    }

    if ( STRING_FIELD_MATCH (gene, Locus, comment_text) ) {
        seq_feat.ResetComment();
        ChangeMade(CCleanupChange::eChangeComment);
    }
}

// Extended cleanup of exception text: rewrites the phrase
// "reasons cited in publication" (any letter case) to
// "reasons given in citation".
void CNewCleanup_imp::x_ExceptTextEC(string& except_text)
{
    const bool is_old_phrase =
        NStr::EqualNocase(except_text, "reasons cited in publication");
    if (!is_old_phrase) {
        return;
    }
    except_text = "reasons given in citation";
    ChangeMade(CCleanupChange::eChangeException);
}


// Checks whether a codon translates to the given amino acid under a genetic
// code.
//   codon_index - codon index, valid range 0..63
//   gcode       - genetic code id passed to CGen_code_table::GetNcbieaa
//   aa          - expected amino acid letter
// Returns false for an index outside 0..63 or when the translation table
// obtained for gcode does not have exactly 64 entries.
bool CNewCleanup_imp::x_IsCodonCorrect(int codon_index, int gcode, unsigned char aa)
{
    // reject both ends of the range: a negative index must never be used
    // to subscript the translation table
    if (codon_index < 0 || codon_index > 63) {
        return false;
    }
    const string& ncbieaa = CGen_code_table::GetNcbieaa(gcode);
    if (ncbieaa.length() != 64) {
        return false;
    }
    unsigned char taa = ncbieaa[codon_index];

    if (taa == aa) {
        return true;
    } else if ((aa == 'U') && (taa == '*') && (codon_index == 14)) {
        // selenocysteine normally uses TGA (14), so ignore without requiring exception in record
        // TAG (11) is used for pyrrolysine in archaebacteria
        // TAA (10) is not yet known to be used for an exceptional amino acid
        return true;
    } else {
        return false;
    }
}


// Character codes accepted as amino acid values by s_GetAaAsChar:
// 42 is '*' and 65..90 are the uppercase letters 'A'..'Z' (ASCII).
int s_LegalNcbieaaValues[] = { 42, 65, 66, 67, 68, 69, 70, 71, 72, 73,
    74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
    84, 85, 86, 87, 88, 89, 90 };

// Returns the amino acid of a tRNA as an Ncbieaa character.
// Values stored as Iupacaa, Ncbi8aa or Ncbistdaa are converted with
// CSeqConvert; an Ncbieaa value is used as is.
// Returns ' ' when the encoding is not one of those four, when the
// conversion yields nothing, or when the resulting character is not listed
// in s_LegalNcbieaaValues.
unsigned char s_GetAaAsChar(const CTrna_ext& trna)
{
    vector<char> seqData;
    string str = "";

    switch (trna.GetAa().Which()) {
    case CTrna_ext::C_Aa::e_Iupacaa:
        str = static_cast<char>(trna.GetAa().GetIupacaa());
        CSeqConvert::Convert(str, CSeqUtil::e_Iupacaa, 0, (TSeqPos)str.size(), seqData, CSeqUtil::e_Ncbieaa);
        break;
    case CTrna_ext::C_Aa::e_Ncbi8aa:
        str = static_cast<char>(trna.GetAa().GetNcbi8aa());
        CSeqConvert::Convert(str, CSeqUtil::e_Ncbi8aa, 0, (TSeqPos)str.size(), seqData, CSeqUtil::e_Ncbieaa);
        break;
    case CTrna_ext::C_Aa::e_Ncbistdaa:
        // read the variant that is actually selected (the Ncbi8aa accessor
        // is the wrong one for an Ncbistdaa value)
        str = static_cast<char>(trna.GetAa().GetNcbistdaa());
        CSeqConvert::Convert(str, CSeqUtil::e_Ncbistdaa, 0, (TSeqPos)str.size(), seqData, CSeqUtil::e_Ncbieaa);
        break;
    case CTrna_ext::C_Aa::e_Ncbieaa:
        seqData.push_back(trna.GetAa().GetNcbieaa());
        break;
    default:
        return ' ';
    }

    // a conversion that produced no output leaves nothing to report
    if (seqData.empty()) {
        return ' ';
    }
    unsigned char aa = seqData[0];

    // make sure the amino acid is valid
    bool found = false;
    for (unsigned int i = 0; i < sizeof(s_LegalNcbieaaValues) / sizeof(int); ++i) {
        if (aa == s_LegalNcbieaaValues[i]) {
            found = true;
            break;
        }
    }
    if (!found) {
        aa = ' ';
    }
    return aa;
}


// Returns the complementary base of an uppercase nucleotide letter
// (A<->T, C<->G).  Any other character is returned unchanged.
char s_Complement(char s)
{
    switch (s) {
    case 'A':
        return 'T';
    case 'C':
        return 'G';
    case 'G':
        return 'C';
    case 'T':
        return 'A';
    default:
        return s;
    }
}

// Returns the base-by-base complement of a nucleotide string, keeping the
// original order of the bases.
static string s_Complement(const string& str)
{
    string complemented;
    complemented.reserve(str.size());
    for (const char base : str) {
        complemented += s_Complement(base);
    }
    return complemented;
}

// Returns the reverse complement of a nucleotide string: the bases are
// complemented and emitted from last to first.
static string s_ReverseComplement(const string& str)
{
    string result;
    result.reserve(str.size());
    for (auto base = str.rbegin(); base != str.rend(); ++base) {
        result += s_Complement(*base);
    }
    return result;
}

// Returns the characters of the string in reverse order.
static string s_Reverse(const string& str)
{
    return string(str.rbegin(), str.rend());
}


// True when the feature has data of subtype tRNA whose RNA-ref carries a
// tRNA extension.
static bool s_IsRealTrna(const CSeq_feat& seq_feat)
{
    if (!seq_feat.IsSetData()) {
        return false;
    }

    const auto& feat_data = seq_feat.GetData();
    if (feat_data.GetSubtype() != CSeqFeatData::eSubtype_tRNA) {
        return false;
    }

    const auto& rna = feat_data.GetRna();
    return rna.IsSetExt() && rna.GetExt().IsTRNA();
}


// Extended cleanup of the recognized codons of a tRNA feature.
//
// Each codon that does not translate to the tRNA's amino acid (under the
// genetic code of the closest BioSource, default 1) is replaced by the first
// of its reverse complement, complement or reverse that does.  Codon value
// 255 and any value above 63 are left alone.  Afterwards the codon list is
// sorted, de-duplicated and removed if empty.
//
// Does nothing unless the feature is a real tRNA with both an amino acid
// and codons, and the amino acid can be expressed as a legal character.
void CNewCleanup_imp::x_tRNACodonEC(CSeq_feat& seq_feat)
{
    if (!s_IsRealTrna(seq_feat)) {
        return;
    }
    CTrna_ext& trna = seq_feat.SetData().SetRna().SetExt().SetTRNA();
    if (!(trna.IsSetAa() && trna.IsSetCodon())) {
        return;
    }

    // Retrieve the genetic code id for the tRNA
    int gcode = 1;
    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq_feat.GetLocation());
    if (bsh) {
        // only the closest biosource is needed
        CSeqdesc_CI source_iter(bsh, CSeqdesc::e_Source);
        if (source_iter) {
            gcode = source_iter->GetSource().GetGenCode();
        }
    }

    const unsigned char aa = s_GetAaAsChar(trna);
    if (aa == ' ') {
        return;
    }

    for (auto& codon_index : trna.SetCodon()) {
        if (codon_index == 255) continue; // universal
        if (codon_index > 63) continue;   // invalid, cannot correct
        if (x_IsCodonCorrect(codon_index, gcode, aa)) continue; // already correct

        const string codon = CGen_code_table::IndexToCodon(codon_index);

        // candidate replacements, in order of preference
        const string candidates[] = {
            s_ReverseComplement(codon),
            s_Complement(codon),
            s_Reverse(codon)
        };
        for (const string& candidate : candidates) {
            const int candidate_index = CGen_code_table::CodonToIndex(candidate);
            if (x_IsCodonCorrect(candidate_index, gcode, aa)) {
                codon_index = candidate_index;
                ChangeMade(CCleanupChange::eChange_tRna);
                break;
            }
        }
    }

    if (!CODON_ON_TRNAEXT_IS_SORTED(trna, s_CodonCompare)) {
        SORT_CODON_ON_TRNAEXT(trna, s_CodonCompare);
        ChangeMade(CCleanupChange::eChange_tRna);
    }

    if (!CODON_ON_TRNAEXT_IS_UNIQUE(trna, s_CodonEqual)) {
        UNIQUE_CODON_ON_TRNAEXT(trna, s_CodonEqual);
        ChangeMade(CCleanupChange::eChange_tRna);
    }

    REMOVE_IF_EMPTY_CODON_ON_TRNAEXT(trna);
}


// Extended cleanup entry point for tRNA features: runs the codon cleanup
// when the feature is a real tRNA, and does nothing otherwise.
void CNewCleanup_imp::x_tRNAEC(CSeq_feat& seq_feat)
{
    if (s_IsRealTrna(seq_feat)) {
        x_tRNACodonEC(seq_feat);
    }
}

// Erases user-object descriptors that carry no data.
// User objects whose type string is "NcbiAutofix" or "Unverified"
// (case-insensitive) are kept even when they are empty.
void CNewCleanup_imp::x_RemoveEmptyUserObject( CSeq_descr & seq_descr )
{
    EDIT_EACH_SEQDESC_ON_SEQDESCR( desc_iter, seq_descr ) {
        CSeqdesc &desc = **desc_iter;
        if( ! desc.IsUser() ) {
            continue;
        }

        CUser_object & user_obj = desc.SetUser();

        // objects that still carry data are never removed
        if( ! RAW_FIELD_IS_EMPTY_OR_UNSET(user_obj, Data) ) {
            continue;
        }

        // type string of the object, or the empty string if it has none
        const string &type_str =
            FIELD_IS_SET_AND_IS(user_obj, Type, Str)
                ? user_obj.GetType().GetStr()
                : kEmptyStr;

        const bool is_exempt_type =
            NStr::EqualNocase(type_str, "NcbiAutofix") ||
            NStr::EqualNocase(type_str, "Unverified");
        if( is_exempt_type ) {
            continue;
        }

        ERASE_SEQDESC_ON_SEQDESCR(desc_iter, seq_descr);
        ChangeMade(CCleanupChange::eRemoveDescriptor);
    }
}


// Helper for removing GenBank Block Keywords.
// Predicate returning true for a keyword that should be dropped:
// "HTG" always, and the technique-specific keyword (HTGS_PHASE0..3, EST,
// STS, GSS) only when it matches the technique stored in m_Tech.
struct SKeywordChecker
{
    CMolInfo::TTech m_Tech;
    bool operator()(const string& keyword)
    {
        if (NStr::Equal(keyword, "HTG")) {
            return true;
        }

        // the single keyword made redundant by the stored technique, if any
        const char* redundant = nullptr;
        switch (m_Tech) {
        case CMolInfo::eTech_htgs_0:
            redundant = "HTGS_PHASE0";
            break;
        case CMolInfo::eTech_htgs_1:
            redundant = "HTGS_PHASE1";
            break;
        case CMolInfo::eTech_htgs_2:
            redundant = "HTGS_PHASE2";
            break;
        case CMolInfo::eTech_htgs_3:
            redundant = "HTGS_PHASE3";
            break;
        case CMolInfo::eTech_est:
            redundant = "EST";
            break;
        case CMolInfo::eTech_sts:
            redundant = "STS";
            break;
        case CMolInfo::eTech_survey:
            redundant = "GSS";
            break;
        default:
            break;
        }
        return redundant != nullptr  &&  NStr::Equal(keyword, redundant);
    }
};


// Removes from the GenBank block every keyword accepted by SKeywordChecker
// for the given technique.  If no keywords remain the keyword field is
// reset.  Returns true when the field was reset or its size changed.
bool CNewCleanup_imp::x_CleanGenbankKeywords(CGB_block& blk, CMolInfo::TTech tech)
{
    if (!blk.IsSetKeywords()) {
        return false;
    }

    auto& kw_list = blk.SetKeywords();
    const size_t count_before = kw_list.size();
    kw_list.erase(std::remove_if(kw_list.begin(), kw_list.end(), SKeywordChecker{ tech }),
                  kw_list.end());

    if (kw_list.empty()) {
        // an empty keyword list is dropped entirely (and reported as a change)
        blk.ResetKeywords();
        return true;
    }
    return kw_list.size() != count_before;
}


// Sets molinfo.tech from a keyword string.  Recognized keywords are
// HTGS_PHASE0..3, EST, STS and GSS; returns true when the keyword was
// recognized and the technique was set, false otherwise (molinfo untouched).
bool s_SetMolinfoTechFromString(CMolInfo& molinfo, const string& keyword)
{
    static const struct {
        const char*     name;
        CMolInfo::TTech tech;
    } kTechByKeyword[] = {
        { "HTGS_PHASE0", CMolInfo::eTech_htgs_0 },
        { "HTGS_PHASE1", CMolInfo::eTech_htgs_1 },
        { "HTGS_PHASE2", CMolInfo::eTech_htgs_2 },
        { "HTGS_PHASE3", CMolInfo::eTech_htgs_3 },
        { "EST",         CMolInfo::eTech_est },
        { "STS",         CMolInfo::eTech_sts },
        { "GSS",         CMolInfo::eTech_survey }
    };

    for (const auto& entry : kTechByKeyword) {
        if (NStr::Equal(keyword, entry.name)) {
            molinfo.SetTech(entry.tech);
            return true;
        }
    }
    return false;
}


// if molinfo is missing tech, try to set it using GB_block.div
// note - may want to also do this with keywords later
// When the division string is accepted by s_SetMolinfoTechFromString the
// division is cleared from the block and a MolInfo change is recorded.
void CNewCleanup_imp::x_SetMolInfoTechFromGenBankBlock(CSeq_descr& seq_descr, CGB_block& block)
{
    if (!block.IsSetDiv()) {
        return;
    }

    for (auto& desc : seq_descr.Set()) {
        // only MolInfo descriptors that lack a technique are candidates
        if (!desc->IsMolinfo()  ||  desc->GetMolinfo().IsSetTech()) {
            continue;
        }
        // the division may already have been consumed by an earlier MolInfo
        if (!block.IsSetDiv()) {
            continue;
        }
        if (s_SetMolinfoTechFromString(desc->SetMolinfo(), block.GetDiv())) {
            block.ResetDiv();
            ChangeMade(CCleanupChange::eChangeMolInfo);
        }
    }
}


// Runs the (descriptor set, GenBank block) overload once for every GenBank
// descriptor found in seq_descr.
void CNewCleanup_imp::x_SetMolInfoTechFromGenBankBlock(CSeq_descr& seq_descr)
{
    for (auto& desc : seq_descr.Set()) {
        if (!desc->IsGenbank()) {
            continue;
        }
        x_SetMolInfoTechFromGenBankBlock(seq_descr, desc->SetGenbank());
    }
}


// Cleans every GenBank block among the descriptors of a Bioseq-set.
// The BioSource and MolInfo technique used for the cleanup are taken from
// the set's own descriptors (the last one of each kind wins); the blocks
// are cleaned with is_patent = false.
void CNewCleanup_imp::x_CleanupGenbankBlock(CBioseq_set& set)
{
    if (!set.IsSetDescr()) {
        return;
    }
    auto& descriptors = set.SetDescr().Set();

    // first pass: collect context
    CConstRef<CBioSource> biosrc(NULL);
    CMolInfo::TTech tech = CMolInfo::eTech_unknown;
    for (const auto& d : descriptors) {
        if (d->IsSource()) {
            biosrc.Reset(&(d->GetSource()));
        } else if (d->IsMolinfo()  &&  d->GetMolinfo().IsSetTech()) {
            tech = d->GetMolinfo().GetTech();
        }
    }

    // second pass: clean each GenBank block with that context
    for (const auto& d : descriptors) {
        if (FIELD_IS(*d, Genbank)) {
            x_CleanupGenbankBlock(d->SetGenbank(), false, biosrc, tech);
        }
    }
}


// Returns the division stored in src.org.orgname, or an empty string when
// the org, the orgname or the division is not set.
string s_GetDiv(const CBioSource& src)
{
    if (src.IsSetOrg()) {
        const auto& org = src.GetOrg();
        if (org.IsSetOrgname()) {
            const auto& orgname = org.GetOrgname();
            if (orgname.IsSetDiv()) {
                return orgname.GetDiv();
            }
        }
    }
    return kEmptyStr;
}


// Strips "(strain <name>)" from src for every strain modifier found on the
// BioSource's orgname, collapsing one level of doubled spaces and trimming
// surrounding whitespace after each removal.  Does nothing when the
// BioSource has no orgname modifiers or src contains no "(strain ".
void RemoveStrain(string& src, const CBioSource& biosrc)
{
    if (!biosrc.IsSetOrg()  ||  !biosrc.GetOrg().IsSetOrgname()) {
        return;
    }
    const auto& orgname = biosrc.GetOrg().GetOrgname();
    if (!orgname.IsSetMod()) {
        return;
    }
    if (NStr::Find(src, "(strain ") == string::npos) {
        return;
    }

    for (const auto& mod : orgname.GetMod()) {
        const bool is_named_strain =
            mod->IsSetSubtype()  &&
            mod->GetSubtype() == COrgMod::eSubtype_strain  &&
            mod->IsSetSubname();
        if (!is_named_strain) {
            continue;
        }
        const string pattern = "(strain " + mod->GetSubname() + ")";
        NStr::ReplaceInPlace(src, pattern, "");
        NStr::ReplaceInPlace(src, "  ", " ");
        NStr::TruncateSpacesInPlace(src);
    }
}


bool CNewCleanup_imp::x_CanRemoveGenbankBlockSource(const string& src, const CBioSource& biosrc)
{
    string compare = src;
    if (NStr::EndsWith(compare, " DNA.")) {
        compare = compare.substr(0, compare.length() - 5);
    } else if (NStr::EndsWith(compare, " rRNA.")) {
        compare = compare.substr(0, compare.length() - 6);
    }
    if (NStr::EndsWith(compare, ".")) {
        compare = compare.substr(0, compare.length() - 1);
        NStr::TruncateSpacesInPlace(compare);
    }
    RemoveStrain(compare, biosrc);

    if (biosrc.IsSetOrg()) {
        const auto& org = biosrc.GetOrg();
        if (org.IsSetTaxname() &&
            NStr::Equal(compare, org.GetTaxname())) {
            return true;
        }
        if (org.IsSetCommon() &&
            NStr::Equal(compare, org.GetCommon())) {
            return true;
        }
        if (org.IsSetOrgname()) {
            const auto& orgname = org.GetOrgname();
            if (orgname.IsSetMod()) {
                for (auto m : orgname.GetMod()) {
                    if (m->IsSetSubtype() &&
                        m->GetSubtype() == COrgMod::eSubtype_old_name &&
                        m->IsSetSubname() &&
                        NStr::Equal(m->GetSubname(), compare)) {
                        return true;
                    }
                }
            }
        }
    }
    return false;
}


// Removes GenBank block fields that duplicate information held elsewhere:
//  - div, when it equals the BioSource division, is "PAT" on a patent
//    sequence, is "HTG" with an HTGS technique, or is a keyword made
//    redundant by the technique (see SKeywordChecker);
//  - source, when x_CanRemoveGenbankBlockSource() says it is redundant;
//  - keywords, via x_CleanGenbankKeywords().
void CNewCleanup_imp::x_CleanupGenbankBlock(CGB_block& gb, bool is_patent, CConstRef<CBioSource> biosrc, CMolInfo::TTech tech)
{
    if (gb.IsSetDiv()) {
        const string& div = gb.GetDiv();
        bool drop_div = false;

        if (biosrc  &&  NStr::Equal(div, s_GetDiv(*biosrc))) {
            drop_div = true;
        } else if (is_patent  &&  NStr::Equal(div, "PAT")) {
            drop_div = true;
        } else if (NStr::Equal(div, "HTG")) {
            // "HTG" is only redundant with one of the HTGS techniques
            switch (tech) {
            case CMolInfo::eTech_htgs_0:
            case CMolInfo::eTech_htgs_1:
            case CMolInfo::eTech_htgs_2:
            case CMolInfo::eTech_htgs_3:
                drop_div = true;
                break;
            default:
                break;
            }
        } else {
            drop_div = SKeywordChecker{ tech }(div);
        }

        if (drop_div) {
            gb.ResetDiv();
            ChangeMade(CCleanupChange::eChangeOther);
        }
    }

    if (gb.IsSetSource()  &&  biosrc  &&
        x_CanRemoveGenbankBlockSource(gb.GetSource(), *biosrc)) {
        gb.ResetSource();
        ChangeMade(CCleanupChange::eChangeOther);
    }

    if (x_CleanGenbankKeywords(gb, tech)) {
        ChangeMade(CCleanupChange::eChangeKeywords);
    }
}


// Cleans the GenBank blocks among a Bioseq's descriptors.
// First tries to move GB_block.div into MolInfo.tech, then gathers the
// context (patent id present, first BioSource and MolInfo descriptor found
// through the scope's Bioseq handle) and cleans every GenBank block with it.
void CNewCleanup_imp::x_CleanupGenbankBlock(CBioseq& seq)
{
    if (!seq.IsSetDescr()) {
        return;
    }
    x_SetMolInfoTechFromGenBankBlock(seq.SetDescr());

    bool is_patent = false;
    for (const auto& id : seq.GetId()) {
        if (id->IsPatent()) {
            is_patent = true;
            break;
        }
    }

    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);

    CConstRef<CBioSource> biosrc(NULL);
    CSeqdesc_CI src_ci(bsh, CSeqdesc::e_Source);
    if (src_ci) {
        biosrc.Reset(&(src_ci->GetSource()));
    }

    CMolInfo::TTech tech = CMolInfo::eTech_unknown;
    CSeqdesc_CI molinfo_ci(bsh, CSeqdesc::e_Molinfo);
    if (molinfo_ci  &&  molinfo_ci->GetMolinfo().IsSetTech()) {
        tech = molinfo_ci->GetMolinfo().GetTech();
    }

    EDIT_EACH_SEQDESC_ON_SEQDESCR(descr_iter, seq.SetDescr()) {
        CSeqdesc &cur = **descr_iter;
        if (FIELD_IS(cur, Genbank)) {
            x_CleanupGenbankBlock(cur.SetGenbank(), is_patent, biosrc, tech);
        }
    }
}


// For every GenBank descriptor: drops the taxonomy field, and drops the
// division when it is "UNA", "UNC" or blank.
void CNewCleanup_imp::x_CleanupGenbankBlock( CSeq_descr & seq_descr )
{
    EDIT_EACH_SEQDESC_ON_SEQDESCR( descr_iter, seq_descr ) {
        CSeqdesc &cur = **descr_iter;
        if( ! FIELD_IS(cur, Genbank) ) {
            continue;
        }
        CGB_block& gb = cur.SetGenbank();

        if (gb.IsSetTaxonomy()) {
            gb.ResetTaxonomy();
            ChangeMade(CCleanupChange::eChangeOther);
        }

        if (!gb.IsSetDiv()) {
            continue;
        }
        const string& div = gb.GetDiv();
        const bool meaningless_div = NStr::Equal(div, "UNA")  ||
                                     NStr::Equal(div, "UNC")  ||
                                     NStr::IsBlank(div);
        if (meaningless_div) {
            gb.ResetDiv();
            ChangeMade(CCleanupChange::eChangeOther);
        }
    }
}


// Maps a legacy GIBB-mol molecule type onto the CMolInfo biomol value of
// the same name.  Any value without a case label yields eBiomol_unknown.
CMolInfo::TBiomol s_BiomolFromGIBBMolType(EGIBB_mol mol)
{
    switch (mol) {
        case eGIBB_mol_genomic:
            return CMolInfo::eBiomol_genomic;
        case eGIBB_mol_genomic_mRNA:
            return CMolInfo::eBiomol_genomic_mRNA;
        case eGIBB_mol_mRNA:
            return CMolInfo::eBiomol_mRNA;
        case eGIBB_mol_other:
            return CMolInfo::eBiomol_other;
        case eGIBB_mol_other_genetic:
            return CMolInfo::eBiomol_other_genetic;
        case eGIBB_mol_peptide:
            return CMolInfo::eBiomol_peptide;
        case eGIBB_mol_pre_mRNA:
            return CMolInfo::eBiomol_pre_RNA;
        case eGIBB_mol_rRNA:
            return CMolInfo::eBiomol_rRNA;
        case eGIBB_mol_scRNA:
            return CMolInfo::eBiomol_scRNA;
        case eGIBB_mol_snRNA:
            return CMolInfo::eBiomol_snRNA;
        case eGIBB_mol_tRNA:
            // transfer RNA maps to tRNA (previously mis-mapped to tmRNA)
            return CMolInfo::eBiomol_tRNA;
        case eGIBB_mol_unknown:
            return CMolInfo::eBiomol_unknown;
    }
    return CMolInfo::eBiomol_unknown;
}


// Maps a legacy GIBB-method value onto the CMolInfo technique of the same
// name.  Any value without a case label yields eTech_other.
CMolInfo::TTech s_TechFromGIBBMethod(EGIBB_method method)
{
    switch (method)
    {
        case eGIBB_method_concept_trans:
            return CMolInfo::eTech_concept_trans;
        case eGIBB_method_seq_pept:
            return CMolInfo::eTech_seq_pept;
        case eGIBB_method_both:
            return CMolInfo::eTech_both;
        case eGIBB_method_seq_pept_overlap:
            return CMolInfo::eTech_seq_pept_overlap;
        case eGIBB_method_seq_pept_homol:
            // keep the "homol" distinction (previously collapsed into seq_pept)
            return CMolInfo::eTech_seq_pept_homol;
        case eGIBB_method_concept_trans_a:
            return CMolInfo::eTech_concept_trans_a;
        case eGIBB_method_other:
            return CMolInfo::eTech_other;
    }
    return CMolInfo::eTech_other;
}


// Applies a legacy GIBB-mod modifier to a MolInfo.
// partial / complete / no_left / no_right set the completeness;
// est / sts / survey set the technique.
// Returns true when the modifier was one of those and mi was written,
// false for every other modifier (mi untouched).
bool SetMolinfoFromGIBBMod(CMolInfo& mi, EGIBB_mod mod)
{
    bool changed = false;
    switch (mod) {
        case eGIBB_mod_partial:
            mi.SetCompleteness(CMolInfo::eCompleteness_partial);
            changed = true;
            break;
        case eGIBB_mod_complete:
            mi.SetCompleteness(CMolInfo::eCompleteness_complete);
            changed = true;
            break;
        case eGIBB_mod_no_left:
            mi.SetCompleteness(CMolInfo::eCompleteness_no_left);
            changed = true;
            break;
        case eGIBB_mod_no_right:
            mi.SetCompleteness(CMolInfo::eCompleteness_no_right);
            changed = true;
            break;
        case eGIBB_mod_est:
            mi.SetTech(CMolInfo::eTech_est);
            changed = true;
            break;
        case eGIBB_mod_sts:
            // sts is a technique, not a completeness value
            mi.SetTech(CMolInfo::eTech_sts);
            changed = true;
            break;
        case eGIBB_mod_survey:
            // survey is a technique, not a completeness value
            mi.SetTech(CMolInfo::eTech_survey);
            changed = true;
            break;
        default:
            break;
    }
    return changed;
}


// Folds the legacy Mol-type, Method and Modif descriptors of a Bioseq into
// its MolInfo descriptor.
//  - Mol-type / Method descriptors are erased when their value was copied
//    into the MolInfo or already agrees with it; conflicting ones are kept.
//  - Modif descriptors are applied through SetMolinfoFromGIBBMod() but are
//    not erased here.
// If anything was copied, the existing MolInfo descriptor is updated, or a
// new one is appended when the Bioseq had none.
void CNewCleanup_imp::x_RescueMolInfo(CBioseq& seq)
{
    if (!seq.IsSetDescr()) {
        return;
    }
    auto& descriptors = seq.SetDescr().Set();

    // working copy of the (last) existing MolInfo, if there is one
    CRef<CSeqdesc> molinfo_desc(NULL);
    CRef<CMolInfo> merged(new CMolInfo());
    for (auto& entry : descriptors) {
        if (entry->IsMolinfo()) {
            molinfo_desc = entry;
            merged->Assign(entry->GetMolinfo());
        }
    }

    bool merged_changed = false;
    for (auto pos = descriptors.begin();  pos != descriptors.end(); ) {
        bool obsolete = false;

        if ((*pos)->IsMol_type()) {
            const CMolInfo::TBiomol biomol = s_BiomolFromGIBBMolType((*pos)->GetMol_type());
            if (!merged->IsSetBiomol()) {
                merged->SetBiomol(biomol);
                merged_changed = true;
                obsolete = true;
            } else {
                obsolete = (merged->GetBiomol() == biomol);
            }
        } else if ((*pos)->IsMethod()) {
            const CMolInfo::TTech tech = s_TechFromGIBBMethod((*pos)->GetMethod());
            if (!merged->IsSetTech()) {
                merged->SetTech(tech);
                merged_changed = true;
                obsolete = true;
            } else {
                obsolete = (merged->GetTech() == tech);
            }
        } else if ((*pos)->IsModif()) {
            for (const auto& mod : (*pos)->GetModif()) {
                if (SetMolinfoFromGIBBMod(*merged, mod)) {
                    merged_changed = true;
                }
            }
        }

        if (!obsolete) {
            ++pos;
            continue;
        }
        pos = descriptors.erase(pos);
        ChangeMade(CCleanupChange::eRemoveDescriptor);
    }

    if (!merged_changed) {
        return;
    }
    if (molinfo_desc) {
        molinfo_desc->SetMolinfo().Assign(*merged);
        ChangeMade(CCleanupChange::eChangeMolInfo);
    } else {
        molinfo_desc.Reset(new CSeqdesc());
        molinfo_desc->SetMolinfo().Assign(*merged);
        descriptors.push_back(molinfo_desc);
        ChangeMade(CCleanupChange::eAddDescriptor);
    }
}


// Erases every Mol-type, Method and Org descriptor from the set, recording
// a descriptor removal for each one.
void CNewCleanup_imp::x_RemoveOldDescriptors( CSeq_descr & seq_descr )
{
    EDIT_EACH_SEQDESC_ON_SEQDESCR( d, seq_descr ) {
        const auto kind = (*d)->Which();
        const bool is_legacy = kind == CSeqdesc::e_Mol_type  ||
                               kind == CSeqdesc::e_Method    ||
                               kind == CSeqdesc::e_Org;
        if (is_legacy) {
            ERASE_SEQDESC_ON_SEQDESCR(d, seq_descr);
            ChangeMade(CCleanupChange::eRemoveDescriptor);
        }
    }
}


// A GenBank block counts as empty when it has no extra accessions, no
// keywords, and its source, origin, date and div are each unset or blank.
bool CNewCleanup_imp::x_IsGenbankBlockEmpty(const CGB_block& gbk)
{
    if (gbk.IsSetExtra_accessions()  &&  !gbk.GetExtra_accessions().empty()) {
        return false;
    }
    if (gbk.IsSetSource()  &&  !NStr::IsBlank(gbk.GetSource())) {
        return false;
    }
    if (gbk.IsSetKeywords()  &&  !gbk.GetKeywords().empty()) {
        return false;
    }
    if (gbk.IsSetOrigin()  &&  !NStr::IsBlank(gbk.GetOrigin())) {
        return false;
    }
    if (gbk.IsSetDate()  &&  !NStr::IsBlank(gbk.GetDate())) {
        return false;
    }
    if (gbk.IsSetDiv()  &&  !NStr::IsBlank(gbk.GetDiv())) {
        return false;
    }
    return true;
}


// Erases Pub descriptors whose content is bad (non-strict check) and
// GenBank descriptors that are empty once their taxonomy field has been
// dropped.
void CNewCleanup_imp::x_RemoveEmptyDescriptors(CSeq_descr& seq_descr)
{
    EDIT_EACH_SEQDESC_ON_SEQDESCR(d, seq_descr) {
        CSeqdesc& cur = **d;

        if (cur.IsPub()) {
            if (x_IsPubContentBad(cur.GetPub(), false)) {
                ERASE_SEQDESC_ON_SEQDESCR(d, seq_descr);
                ChangeMade(CCleanupChange::eRemoveDescriptor);
            }
            continue;
        }

        if (!cur.IsGenbank()) {
            continue;
        }
        CGB_block& blk = cur.SetGenbank();
        if (blk.IsSetTaxonomy()) {
            blk.ResetTaxonomy();
            ChangeMade(CCleanupChange::eChangeOther);
        }
        if (x_IsGenbankBlockEmpty(blk)) {
            ERASE_SEQDESC_ON_SEQDESCR(d, seq_descr);
            ChangeMade(CCleanupChange::eRemoveDescriptor);
        }
    }
}


// Resets gene-ref fields that are set but carry nothing: blank locus,
// allele, desc, maploc and locus_tag strings, and empty db / syn lists.
// Each reset is recorded as eChangeOther.  Returns true if any field was
// reset.
bool CNewCleanup_imp::x_CleanEmptyGene(CGene_ref& gene)
{
    bool cleaned = false;
    auto note_reset = [this, &cleaned]() {
        ChangeMade(CCleanupChange::eChangeOther);
        cleaned = true;
    };

    if (gene.IsSetLocus()  &&  NStr::IsBlank(gene.GetLocus())) {
        gene.ResetLocus();
        note_reset();
    }
    if (gene.IsSetAllele()  &&  NStr::IsBlank(gene.GetAllele())) {
        gene.ResetAllele();
        note_reset();
    }
    if (gene.IsSetDesc()  &&  NStr::IsBlank(gene.GetDesc())) {
        gene.ResetDesc();
        note_reset();
    }
    if (gene.IsSetMaploc()  &&  NStr::IsBlank(gene.GetMaploc())) {
        gene.ResetMaploc();
        note_reset();
    }
    if (gene.IsSetLocus_tag()  &&  NStr::IsBlank(gene.GetLocus_tag())) {
        gene.ResetLocus_tag();
        note_reset();
    }
    if (gene.IsSetDb()  &&  gene.GetDb().empty()) {
        gene.ResetDb();
        note_reset();
    }
    if (gene.IsSetSyn()  &&  gene.GetSyn().empty()) {
        gene.ResetSyn();
        note_reset();
    }
    return cleaned;
}


bool s_FeatureHasEvidenceOrInferenceQuals(const CSeq_feat& feat)
{
    if (!feat.IsSetQual()) {
        return false;
    }
    ITERATE(CSeq_feat::TQual, it, feat.GetQual()) {
        if ((*it)->IsSetQual() &&
            (NStr::Equal((*it)->GetQual(), "evidence") ||
            (NStr::Equal((*it)->GetQual(), "inference")))) {
            return true;
        }
    }
    return false;
}


// A gene is removable when none of its descriptive fields (locus, allele,
// desc, maploc, locus_tag, db, syn, pseudo) is set and the feature itself
// has no pseudo flag, no exp-ev, and no evidence/inference qualifiers.
bool CNewCleanup_imp::x_ShouldRemoveEmptyGene(const CGene_ref& gene, const CSeq_feat& feat)
{
    const bool gene_has_content =
        gene.IsSetLocus()      ||
        gene.IsSetAllele()     ||
        gene.IsSetDesc()       ||
        gene.IsSetMaploc()     ||
        gene.IsSetLocus_tag()  ||
        gene.IsSetDb()         ||
        gene.IsSetSyn()        ||
        gene.IsSetPseudo();
    if (gene_has_content) {
        return false;
    }
    if (feat.IsSetPseudo()  ||  feat.IsSetExp_ev()) {
        return false;
    }
    return !s_FeatureHasEvidenceOrInferenceQuals(feat);
}


// Resets prot-ref fields that are set but carry nothing: a name list that
// is empty or whose first entry is blank, empty ec / db / activity lists,
// and a blank desc.  Each reset is recorded as eChangeOther.  Returns true
// if any field was reset.
bool CNewCleanup_imp::x_CleanEmptyProt(CProt_ref& prot)
{
    bool cleaned = false;
    auto note_reset = [this, &cleaned]() {
        ChangeMade(CCleanupChange::eChangeOther);
        cleaned = true;
    };

    if (prot.IsSetName()  &&
        (prot.GetName().empty()  ||  NStr::IsBlank(prot.GetName().front()))) {
        prot.ResetName();
        note_reset();
    }
    if (prot.IsSetEc()  &&  prot.GetEc().empty()) {
        prot.ResetEc();
        note_reset();
    }
    if (prot.IsSetDb()  &&  prot.GetDb().empty()) {
        prot.ResetDb();
        note_reset();
    }
    if (prot.IsSetActivity()  &&  prot.GetActivity().empty()) {
        prot.ResetActivity();
        note_reset();
    }
    if (prot.IsSetDesc()  &&  NStr::IsBlank(prot.GetDesc())) {
        prot.ResetDesc();
        note_reset();
    }
    return cleaned;
}


// A protein is removable when it has no name, desc, ec, activity or db.
// Proteins with a processed value other than eProcessed_not_set are never
// removable.
bool CNewCleanup_imp::x_ShouldRemoveEmptyProt(const CProt_ref& prot)
{
    if (prot.IsSetProcessed()  &&
        prot.GetProcessed() != CProt_ref::eProcessed_not_set) {
        return false;
    }

    const bool has_content = prot.IsSetName()      ||
                             prot.IsSetDesc()      ||
                             prot.IsSetEc()        ||
                             prot.IsSetActivity()  ||
                             prot.IsSetDb();
    return !has_content;
}


// if bond is other and comment can be used to set bond type, do so.
// Concretely: a "misc_feature" import feature whose comment ends in " bond"
// is converted into a bond feature (eBond_other) when the text preceding
// " bond" is a bond name known to CBondList.
void CNewCleanup_imp::x_BondEC(CSeq_feat& feat)
{
    if (!feat.IsSetData()  ||  !feat.GetData().IsImp()) {
        return;
    }
    const auto& imp = feat.GetData().GetImp();
    if (!imp.IsSetKey()  ||  !NStr::Equal(imp.GetKey(), "misc_feature")) {
        return;
    }
    if (!feat.IsSetComment()  ||  !NStr::EndsWith(feat.GetComment(), " bond")) {
        return;
    }

    // strip the trailing " bond" (5 characters)
    const string& comment = feat.GetComment();
    string bond_type = comment.substr(0, comment.length() - 5);

    CBondList bl;
    if (bl.IsBondName(bond_type)) {
        feat.SetData().SetBond(CSeqFeatData::eBond_other);
        ChangeMade(CCleanupChange::eChangeOther);
    }
}


// True when the author has a usable name: a structured name with a
// non-blank last name, or a non-blank consortium or plain string.
bool HasAuthor(const CAuthor& author)
{
    if (!author.IsSetName()) {
        return false;
    }
    const auto& pid = author.GetName();

    if (pid.IsName()) {
        const auto& std_name = pid.GetName();
        return std_name.IsSetLast()  &&  !NStr::IsBlank(std_name.GetLast());
    }
    if (pid.IsConsortium()) {
        return !NStr::IsBlank(pid.GetConsortium());
    }
    if (pid.IsStr()) {
        return !NStr::IsBlank(pid.GetStr());
    }
    return false;
}


// True when the author list names at least one author:
//  - std list: any author whose name is a Name, Ml, Str or Consortium;
//  - ml / str list: the list is non-empty and its first entry is non-blank.
bool HasAuthor(const CAuth_list& auth_list)
{
    if (!auth_list.IsSetNames()) {
        return false;
    }
    const auto& names = auth_list.GetNames();

    if (names.IsStd()) {
        for (const auto& author : names.GetStd()) {
            if (!author->CanGetName()) {
                continue;
            }
            const CPerson_id& pid = author->GetName();
            if (pid.IsName()  ||  pid.IsMl()  ||  pid.IsStr()  ||  pid.IsConsortium()) {
                return true;
            }
        }
        return false;
    }
    if (names.IsMl()) {
        const auto& ml = names.GetMl();
        return !ml.empty()  &&  !NStr::IsBlank(ml.front());
    }
    if (names.IsStr()) {
        const auto& strs = names.GetStr();
        return !strs.empty()  &&  !NStr::IsBlank(strs.front());
    }
    return false;
}


// Author check for a whole Pubdesc.
// Returns true as soon as a pub with a named author is found, or (when not
// strict) as soon as a patent pub is found.  Otherwise: strict mode fails;
// non-strict mode passes only if no pub had an author list at all.
bool HasAuthor(const CPubdesc& pub, bool strict)
{
    if (!pub.IsSetPub()) {
        return false;
    }

    bool saw_author_list = false;
    for (const auto& entry : pub.GetPub().Get()) {
        // if patent and not strict, just patent is ok
        if (!strict  &&  entry->IsPatent()) {
            return true;
        }
        if (!entry->IsSetAuthors()) {
            continue;
        }
        saw_author_list = true;
        if (HasAuthor(entry->GetAuthors())) {
            return true;
        }
    }
    return !strict  &&  !saw_author_list;
}


// note: Yes, a gen pub is minimal if it DOES have a citation
// but not journal, authors, volume, and pages
// copied logic from C Toolkit
bool IsMinimal(const CCit_gen& gen)
{
    if (!gen.IsSetCit()) {
        return false;
    }
    const bool has_details = gen.IsSetJournal()  ||
                             gen.IsSetAuthors()  ||
                             gen.IsSetVolume()   ||
                             gen.IsSetPages();
    return !has_details;
}

// Expands to one `case` of a switch over a title's choice: when the title
// text of the given type is not blank, sets has_title to true.
// The expansion site must provide a pointer-like variable named `title`
// and a bool named `has_title`.
#define CHECK_TITLE(Title_type) \
    case CTitle::C_E::e_##Title_type: \
        if (!NStr::IsBlank(title->Get##Title_type())) { \
            has_title = true; \
        } \
        break;


// A patent id is bad only when it has an id that is blank while country
// and doc-type are also unset or blank.  A patent id whose id field is not
// set is reported as not bad.
bool CNewCleanup_imp::x_IsPubContentBad(const CId_pat& pat)
{
    const bool has_country  = pat.IsSetCountry()   &&  !NStr::IsBlank(pat.GetCountry());
    if (has_country) {
        return false;
    }
    const bool has_doc_type = pat.IsSetDoc_type()  &&  !NStr::IsBlank(pat.GetDoc_type());
    if (has_doc_type) {
        return false;
    }
    if (!pat.IsSetId()) {
        return false;
    }

    const auto& id = pat.GetId();
    const bool has_app_number = id.IsApp_number()  &&  !NStr::IsBlank(id.GetApp_number());
    const bool has_number     = id.IsNumber()      &&  !NStr::IsBlank(id.GetNumber());
    return !(has_app_number  ||  has_number);
}


// A single pub is bad when it is a minimal Cit-gen, a zero muid or pmid,
// or a patent id that is itself bad.
bool CNewCleanup_imp::x_IsPubContentBad(const CPub& pub)
{
    if (pub.IsGen()) {
        if (IsMinimal(pub.GetGen())) {
            return true;
        }
    }
    if (pub.IsMuid()) {
        if (pub.GetMuid() == ZERO_ENTREZ_ID) {
            return true;
        }
    }
    if (pub.IsPmid()) {
        if (pub.GetPmid() == ZERO_ENTREZ_ID) {
            return true;
        }
    }
    return pub.IsPat_id()  &&  x_IsPubContentBad(pub.GetPat_id());
}


// Decides whether a Pubdesc has unusable content.  Checks, in order:
//  1. no pubs at all                          -> bad
//  2. a non-blank figure (backbone entry)     -> keep
//  3. HasAuthor(pub, strict) fails            -> bad
//  4. exactly one pub and that pub is bad     -> bad
//  5. any journal article lacking an imprint or a non-blank journal
//     title                                   -> bad
// Otherwise the Pubdesc is fine.
bool CNewCleanup_imp::x_IsPubContentBad(const CPubdesc& pub, bool strict)
{
    if (!pub.IsSetPub()) {
        return true;
    }
    const auto& pubset = pub.GetPub().Get();
    if (pubset.empty()) {
        return true;
    }

    if (pub.IsSetFig()  &&  !NStr::IsBlank(pub.GetFig())) {
        return false;
    }

    if (!HasAuthor(pub, strict)) {
        return true;
    }

    if (pubset.size() == 1  &&  x_IsPubContentBad(*pubset.front())) {
        return true;
    }

    for (const auto& entry : pubset) {
        if (!entry->IsArticle()) {
            continue;
        }
        const CCit_art& art = entry->GetArticle();
        if (!art.IsSetFrom()  ||  !art.GetFrom().IsJournal()) {
            continue;
        }

        // all CitArt from journal must have journal title and imprint
        const CCit_jour& jour = art.GetFrom().GetJournal();
        if (!jour.IsSetImp()) {
            return true;
        }

        // CHECK_TITLE relies on the names `title` and `has_title`
        bool has_title = false;
        if (jour.IsSetTitle()) {
            for (const auto& title : jour.GetTitle().Get()) {
                switch (title->Which()) {
                    CHECK_TITLE(Name)
                    CHECK_TITLE(Tsub)
                    CHECK_TITLE(Trans)
                    CHECK_TITLE(Jta)
                    CHECK_TITLE(Iso_jta)
                    CHECK_TITLE(Ml_jta)
                    CHECK_TITLE(Coden)
                    CHECK_TITLE(Issn)
                    CHECK_TITLE(Abr)
                    CHECK_TITLE(Isbn)
                    default:
                        break;
                }
            }
        }
        if (!has_title) {
            return true;
        }
    }
    return false;
}


// A pub feature is removable when its Pubdesc fails the non-strict content
// check.
bool CNewCleanup_imp::x_ShouldRemoveEmptyPub(const CPubdesc& pub)
{
    const bool strict = false;
    return x_IsPubContentBad(pub, strict);
}


// Cleans empty fields out of gene and protein features and tries to rescue
// features that would otherwise be removable but carry a non-blank comment:
//  - gene: the feature is turned into a "misc_feature" import feature;
//  - prot: the comment becomes the protein name (unless it is "putative",
//    in which case nothing is moved but a change is still reported).
// Returns true when a change was made or reported; other feature types are
// left alone.
bool CNewCleanup_imp::x_CleanEmptyFeature(CSeq_feat& feat)
{
    if (!feat.IsSetData()) {
        return false;
    }

    const auto kind = feat.GetData().Which();

    if (kind == CSeqFeatData::e_Gene) {
        bool changed = x_CleanEmptyGene(feat.SetData().SetGene());
        if (x_ShouldRemoveEmptyGene(feat.GetData().GetGene(), feat)  &&
            feat.IsSetComment()  &&  !NStr::IsBlank(feat.GetComment())) {
            feat.SetData().SetImp().SetKey("misc_feature");
            changed = true;
        }
        return changed;
    }

    if (kind == CSeqFeatData::e_Prot) {
        bool changed = x_CleanEmptyProt(feat.SetData().SetProt());
        if (x_ShouldRemoveEmptyProt(feat.GetData().GetProt())  &&
            feat.IsSetComment()  &&  !NStr::IsBlank(feat.GetComment())) {
            if (!NStr::EqualNocase(feat.GetComment(), "putative")) {
                feat.SetData().SetProt().SetName().push_back(feat.GetComment());
                feat.ResetComment();
            }
            changed = true;
        }
        return changed;
    }

    return false;
}


// Dispatches the "is this feature empty?" question by feature type: gene,
// prot and pub features use their dedicated checks, a comment feature is
// empty when its comment is unset or blank, and everything else (including
// features without data) is never considered empty.
bool CNewCleanup_imp::x_ShouldRemoveEmptyFeature(const CSeq_feat& feat)
{
    if (!feat.IsSetData()) {
        return false;
    }

    const auto& data = feat.GetData();
    switch (data.Which()) {
        case CSeqFeatData::e_Gene:
            return x_ShouldRemoveEmptyGene(data.GetGene(), feat);
        case CSeqFeatData::e_Prot:
            return x_ShouldRemoveEmptyProt(data.GetProt());
        case CSeqFeatData::e_Pub:
            return x_ShouldRemoveEmptyPub(data.GetPub());
        case CSeqFeatData::e_Comment:
            return !feat.IsSetComment()  ||  NStr::IsBlank(feat.GetComment());
        default:
            break;
    }
    return false;
}

// Cleans and prunes the features of a feature-table annotation.
// Each feature is cleaned on a scratch copy; if the cleaned copy is empty
// the original is removed through its edit handle, otherwise a changed
// copy replaces the original.  After a removal the scan restarts from the
// beginning of the table, and repeats until a full pass removes nothing.
void CNewCleanup_imp::x_RemoveEmptyFeatures( CSeq_annot & seq_annot )
{
    if (!seq_annot.IsFtable()) {
        return;
    }

    bool removed_one;
    do {
        removed_one = false;
        auto& feats = seq_annot.SetData().SetFtable();
        for (auto pos = feats.begin();  pos != feats.end();  ++pos) {
            CRef<CSeq_feat> scratch(new CSeq_feat());
            scratch->Assign(**pos);
            const bool cleaned = x_CleanEmptyFeature(*scratch);

            if (x_ShouldRemoveEmptyFeature(*scratch)) {
                CSeq_feat_Handle fh = m_Scope->GetSeq_featHandle(**pos);
                CSeq_feat_EditHandle eh(fh);
                eh.Remove();
                removed_one = true;
                ChangeMade(CCleanupChange::eRemoveFeat);
                // restart the scan after a removal
                break;
            }
            if (cleaned) {
                CSeq_feat_Handle fh = m_Scope->GetSeq_featHandle(**pos);
                CSeq_feat_EditHandle eh(fh);
                eh.Replace(*scratch);
            }
        }
    } while (removed_one);
}


// True when the user object is a structured comment whose
// "StructuredCommentPrefix" field is the string
// "##Genome-Annotation-Data-START##".  A CException raised while looking
// up the field is swallowed and treated as "no".
bool s_IsGenomeAnnotationStart(const CUser_object& user)
{
    if (user.GetObjectType() != CUser_object::eObjectType_StructuredComment  ||
        !user.IsSetData()) {
        return false;
    }

    try {
        const CUser_field& prefix = user.GetField("StructuredCommentPrefix");
        if (prefix.IsSetData()  &&  prefix.GetData().IsStr()  &&
            NStr::Equal(prefix.GetData().GetStr(), "##Genome-Annotation-Data-START##")) {
            return true;
        }
    } catch (CException&) {
        // field lookup failed: not a genome annotation start
    }
    return false;
}


// An annotation must be kept even when empty if one of its descriptors is
// a user object recognized by s_IsGenomeAnnotationStart().
bool s_RetainEmptyAnnot(const CSeq_annot& annot)
{
    if (!annot.IsSetDesc()) {
        return false;
    }
    for (const auto& desc : annot.GetDesc().Get()) {
        if (!desc->IsUser()) {
            continue;
        }
        if (s_IsGenomeAnnotationStart(desc->GetUser())) {
            return true;
        }
    }
    return false;
}


// An annotation should be removed when it is an empty feature table or has
// no data at all, unless s_RetainEmptyAnnot() asks for it to be kept.
bool CNewCleanup_imp::ShouldRemoveAnnot(const CSeq_annot& annot)
{
    if (s_RetainEmptyAnnot(annot)) {
        return false;
    }
    if (annot.IsFtable()  &&  annot.GetData().GetFtable().empty()) {
        return true;
    }
    return !annot.IsSetData();
}


// Cleans the annotations of a Bioseq: empty features are removed from every
// feature table, annotations that ShouldRemoveAnnot() flags are removed
// through their edit handle, and the annot list is reset once it is empty.
void CNewCleanup_imp::x_RemoveEmptyFeatureTables( CBioseq & bioseq )
{
    if (!bioseq.IsSetAnnot()) {
        return;
    }

    // A removal ends the current pass (the iterator is not reused after
    // eh.Remove()), so keep rescanning from the start until a complete pass
    // removes nothing.
    bool removed_annot;
    do {
        removed_annot = false;
        for (CBioseq::TAnnot::iterator annot_it = bioseq.SetAnnot().begin();
             annot_it != bioseq.SetAnnot().end();
             ++annot_it) {
            if ((*annot_it)->IsFtable()) {
                x_RemoveEmptyFeatures(**annot_it);
            }
            if (ShouldRemoveAnnot(**annot_it)) {
                CSeq_annot_Handle annot_handle = m_Scope->GetSeq_annotHandle(**annot_it);
                CSeq_annot_EditHandle annot_edit(annot_handle);
                annot_edit.Remove();
                removed_annot = true;
                ChangeMade(CCleanupChange::eChangeOther);
                break;
            }
        }
    } while (removed_annot);

    if (bioseq.GetAnnot().empty()) {
        bioseq.ResetAnnot();
        ChangeMade(CCleanupChange::eChangeOther);
    }
}

// Cleans the annotations of a Bioseq-set: empty features are removed from
// every feature table, annotations that ShouldRemoveAnnot() flags are removed
// through their edit handle, and the annot list is reset once it is empty.
//
// Nothing is done (and no change is recorded) when the set has no annot list
// to begin with, matching the CBioseq overload.
void CNewCleanup_imp::x_RemoveEmptyFeatureTables( CBioseq_set & bioseq_set )
{
    if (!bioseq_set.IsSetAnnot()) {
        return;
    }

    // A removal ends the current pass (the iterator is not reused after
    // eh.Remove()), so keep rescanning from the start until a complete pass
    // removes nothing.
    bool any_erasures = true;
    while (any_erasures) {
        any_erasures = false;
        CBioseq::TAnnot::iterator it = bioseq_set.SetAnnot().begin();
        while (it != bioseq_set.SetAnnot().end()) {
            if ((*it)->IsFtable()) {
                x_RemoveEmptyFeatures(**it);
            }
            if (ShouldRemoveAnnot(**it)) {
                CSeq_annot_Handle ah = m_Scope->GetSeq_annotHandle(**it);
                CSeq_annot_EditHandle eh(ah);
                eh.Remove();
                any_erasures = true;
                ChangeMade(CCleanupChange::eChangeOther);
                break;
            } else {
                ++it;
            }
        }
    }

    // Only reached when an annot list existed on entry, so the reset and the
    // recorded change reflect a real modification.
    if (bioseq_set.GetAnnot().empty()) {
        bioseq_set.ResetAnnot();
        ChangeMade(CCleanupChange::eChangeOther);
    }
}


// A feature table is mergeable with a neighbour only when it is a plain
// ftable annotation: no id, no name, no db and no descriptors set.
bool s_IsMergeableFeatureTable(const CSeq_annot& annot)
{
    return annot.IsFtable() &&
           !annot.IsSetId() &&
           !annot.IsSetName() &&
           !annot.IsSetDb() &&
           !annot.IsSetDesc();
}


// Merges neighbouring feature tables in annot_list.
//
// Walks the list pairwise (it, it_next).  When both members of a pair satisfy
// s_IsMergeableFeatureTable(), every feature of the second annotation is
// transferred to the first with TakeFeat(), the second annotation is removed
// through its edit handle, eRemoveAnnot is recorded, and the scan restarts
// from the beginning.  The outer loop ends after a pass with no merge.
void CNewCleanup_imp::x_MergeAdjacentFeatureTables(list< CRef< CSeq_annot > > & annot_list)
{
    // fewer than two annotations: there is no adjacent pair to merge
    if (annot_list.size() < 2) {
        return;
    }
    bool any_erased = true;
    while (any_erased) {
        any_erased = false;
        CBioseq::TAnnot::iterator it = annot_list.begin();
        CBioseq::TAnnot::iterator it_next = it;
        ++it_next;
        while (it_next != annot_list.end())
        {
            if (s_IsMergeableFeatureTable(**it) &&
                s_IsMergeableFeatureTable(**it_next)) {
                // both edit handles are obtained before any feature is moved
                CSeq_annot_EditHandle eh1 = m_Scope->GetSeq_annotEditHandle(**it);
                CSeq_annot_EditHandle eh2 = m_Scope->GetSeq_annotEditHandle(**it_next);
                // drain the second table from the front until it is empty
                while ((*it_next)->IsSetData() && !(*it_next)->GetData().GetFtable().empty()) {
                    CSeq_feat_Handle fh = m_Scope->GetSeq_featHandle(*((*it_next)->GetData().GetFtable().front()));
                    CSeq_feat_EditHandle efh(fh);
                    eh1.TakeFeat(efh);
                }
                eh2.Remove();
                ChangeMade(CCleanupChange::eRemoveAnnot);
                any_erased = true;
                // iterators are not reused after the removal; rescan instead
                break;
            }
            ++it_next;
            ++it;
        }
    }
}


// Merges adjacent mergeable feature tables among the annotations of a Bioseq.
// Does nothing when the Bioseq has no annot list.
void CNewCleanup_imp::x_MergeAdjacentFeatureTables( CBioseq & bioseq )
{
    if (!bioseq.IsSetAnnot()) {
        return;
    }
    x_MergeAdjacentFeatureTables(bioseq.SetAnnot());
}


// Merges adjacent mergeable feature tables among the annotations of a
// Bioseq-set.  Does nothing when the set has no annot list.
void CNewCleanup_imp::x_MergeAdjacentFeatureTables( CBioseq_set & bioseq_set )
{
    if (!bioseq_set.IsSetAnnot()) {
        return;
    }
    x_MergeAdjacentFeatureTables(bioseq_set.SetAnnot());
}


// Builds a CBioSource from the GB-qualifiers (and comment) of a feature.
//
// Returns a null CRef when the feature has no qualifiers or none of them is
// an "organism" qualifier with a value.  Otherwise:
//   - the taxname comes from the "organism" qualifier (the last one wins),
//   - qualifiers recognised as orgmod/subsource prefixes are appended to the
//     org-ref "mod" list as "<qual>=<val>" (underscores in the qualifier name
//     are replaced by dashes first),
//   - the genome is set from a qualifier naming an organelle,
//   - a non-blank feature comment becomes an orgmod of subtype "other".
CRef<CBioSource> BioSourceFromImpFeat(const CSeq_feat& sf)
{
    CRef<CBioSource> biosrc;
    if (!sf.IsSetQual()) {
        return biosrc;
    }
    const auto& quals = sf.GetQual();

    // Pass 1: an organism qualifier is required to create the source at all.
    for (const auto& gbq : quals) {
        if (gbq->IsSetQual() && NStr::Equal(gbq->GetQual(), "organism") && gbq->IsSetVal()) {
            biosrc.Reset(new CBioSource());
            biosrc->SetOrg().SetTaxname(gbq->GetVal());
        }
    }
    if (!biosrc) {
        return biosrc;
    }

    // Pass 2: fold the remaining information of each qualifier into the source.
    for (const auto& gbq : quals) {
        if (!gbq->IsSetQual() || !gbq->IsSetVal()) {
            continue;
        }
        const string qual_name = NStr::Replace(gbq->GetQual(), "_", "-");

        // decide whether this gbqual should become a modifier string
        string mod_val = qual_name + "=" + gbq->GetVal();
        size_t val_pos;
        COrgMod::TSubtype orgmod_subtype;
        CSubSource::TSubtype subsrc_subtype;
        if (s_StringHasOrgModPrefix(mod_val, val_pos, orgmod_subtype) ||
            s_StringHasSubSourcePrefix(mod_val, val_pos, subsrc_subtype)) {
            biosrc->SetOrg().SetMod().push_back(mod_val);
        }

        const CBioSource::EGenome genome = CBioSource::GetGenomeByOrganelle(qual_name);
        if (genome == CBioSource::eGenome_unknown) {
            continue;
        }
        // The genome is taken when none is set yet; kinetoplast is also
        // allowed to replace an already-set mitochondrion.
        const bool kinetoplast_over_mito =
            biosrc->IsSetGenome() &&
            biosrc->GetGenome() == CBioSource::eGenome_mitochondrion &&
            genome == CBioSource::eGenome_kinetoplast;
        if (!biosrc->IsSetGenome() || kinetoplast_over_mito) {
            biosrc->SetGenome(genome);
        }
    }

    if (sf.IsSetComment() && !NStr::IsBlank(sf.GetComment())) {
        CRef<COrgMod> comment_mod(new COrgMod());
        comment_mod->SetSubtype(COrgMod::eSubtype_other);
        comment_mod->SetSubname(sf.GetComment());
        biosrc->SetOrg().SetOrgname().SetMod().push_back(comment_mod);
    }
    return biosrc;
}


// part of extended cleanup
//
// Handles legacy source information on a Bioseq: Org features and import
// features whose key is "source".
//   - If a source descriptor iterator obtained up front is valid, such a
//     feature is removed and the feature scan restarts.
//   - Otherwise a BioSource is built from the feature with
//     BioSourceFromImpFeat(), cleaned with BiosourceBC(), and appended to the
//     Bioseq as a new source descriptor; the feature itself is left in place
//     by this branch.
void CNewCleanup_imp::x_RemoveOldFeatures(CBioseq & bioseq)
{
    CBioseq_Handle bh = m_Scope->GetBioseqHandle(bioseq);

    // created once, before the loop
    // NOTE(review): `src` is not re-created after a descriptor is added in the
    // else-branch below, so a later source feature in the same call may take
    // the conversion branch again — confirm this is intended.
    CSeqdesc_CI src(bh, CSeqdesc::e_Source);
    bool any_erasures = true;
    while (any_erasures) {
        any_erasures = false;
        // a fresh feature iterator is built for every pass
        CFeat_CI f(bh);
        while (f) {
            if (f->IsSetData()) {
                const auto& fdata = f->GetData();
                if (fdata.IsOrg() ||
                    (fdata.IsImp() && fdata.GetImp().IsSetKey() &&
                     NStr::Equal(fdata.GetImp().GetKey(), "source"))) {
                    if (src) {
                        // remove import source features if source descriptor already present
                        CSeq_feat_Handle fh(*f);
                        CSeq_feat_EditHandle eh(fh);
                        eh.Remove();
                        any_erasures = true;
                        ChangeMade(CCleanupChange::eRemoveFeat);
                        // restart with a new CFeat_CI after the removal
                        break;
                    } else {
                        // convert imp-source feature to biosource
                        CRef<CBioSource> bsrc = BioSourceFromImpFeat(*(f->GetSeq_feat()));
                        if (bsrc) {
                            BiosourceBC(*bsrc);
                            CRef<CSeqdesc> d(new CSeqdesc());
                            d->SetSource().Assign(*bsrc);
                            CBioseq_EditHandle eh(bh);
                            eh.SetDescr().Set().push_back(d);
                            ChangeMade(CCleanupChange::eAddDescriptor);
                        }
                    }
                }
            }
            ++f;
        }
    }

}


// SQD-2043: a pop-set whose member sequences do not all share the same
// taxname is reclassified as a phy-set.
// Part of Extended Cleanup.
//
// For each Bioseq in the set the taxname is read from the first source
// descriptor found for it, or, when there is none, from the first BioSource
// feature; a sequence with neither contributes an empty taxname.  Taxnames
// are compared case-insensitively against that of the first sequence.
void CNewCleanup_imp::x_ChangePopToPhy(CBioseq_set& bioseq_set)
{
    const bool is_pop_set =
        bioseq_set.IsSetClass() &&
        bioseq_set.GetClass() == CBioseq_set::eClass_pop_set;
    if (!is_pop_set) {
        return;
    }

    bool taxnames_differ = false;
    bool have_reference = false;
    string reference_taxname;
    for (CTypeConstIterator<CBioseq> seq_iter(ConstBegin(bioseq_set)); seq_iter; ++seq_iter) {
        string current_taxname;
        CBioseq_Handle bsh = m_Scope->GetBioseqHandle(*seq_iter);

        // Prefer a source descriptor; fall back to a source feature.
        CSeqdesc_CI src_desc(bsh, CSeqdesc::e_Source);
        if (src_desc) {
            const CBioSource& biosrc = src_desc->GetSource();
            if (biosrc.IsSetOrg() && biosrc.GetOrg().IsSetTaxname()) {
                current_taxname = biosrc.GetOrg().GetTaxname();
            }
        } else {
            CFeat_CI src_feat(bsh, CSeqFeatData::e_Biosrc);
            if (src_feat) {
                const CBioSource& biosrc = src_feat->GetData().GetBiosrc();
                if (biosrc.IsSetOrg() && biosrc.GetOrg().IsSetTaxname()) {
                    current_taxname = biosrc.GetOrg().GetTaxname();
                }
            }
        }

        if (!have_reference) {
            // the first sequence only establishes the reference value
            reference_taxname = current_taxname;
            have_reference = true;
        } else if (NStr::CompareNocase(reference_taxname, current_taxname) != 0) {
            taxnames_differ = true;
            break;
        }
    }

    if (taxnames_differ) {
        bioseq_set.SetClass(CBioseq_set::eClass_phy_set);
        ChangeMade(CCleanupChange::eChangeBioseqSetClass);
    }
}

// Extended cleanup for a Bioseq-set: first the pop-set -> phy-set
// reclassification, then the class-specific cleanup steps.
void CNewCleanup_imp::x_BioseqSetEC( CBioseq_set & bioseq_set )
{
    // NOTE: this has to run before the dispatch below, because cleanup rules
    // may be different for pop and phy sets.
    x_ChangePopToPhy(bioseq_set);

    // an unset class is treated as "not_set"
    const auto set_class = bioseq_set.IsSetClass()
        ? bioseq_set.GetClass()
        : CBioseq_set::eClass_not_set;

    switch (set_class) {
        case CBioseq_set::eClass_nuc_prot:
            x_BioseqSetNucProtEC( bioseq_set );
            return;
        case CBioseq_set::eClass_genbank:
            x_BioseqSetGenBankEC(bioseq_set);
            break;
        case CBioseq_set::eClass_mut_set:
        case CBioseq_set::eClass_pop_set:
        case CBioseq_set::eClass_phy_set:
        case CBioseq_set::eClass_eco_set:
        case CBioseq_set::eClass_wgs_set:
        case CBioseq_set::eClass_small_genome_set:
            break;
        default:
            // no special logic for other bioseq-set classes
            return;
    }

    // shared tail for genbank sets and the pop/phy/mut/eco/wgs/small-genome
    // family
    x_RemovePopPhyBioSource(bioseq_set);
    x_RemovePopPhyMolInfo(bioseq_set);
}


// Returns true when `descr` contains a pub descriptor whose Pubdesc
// Equals() `pub`.
bool IsPubInSet(const CSeq_descr& descr, const CPubdesc& pub)
{
    for (const auto& desc : descr.Get()) {
        if (desc->IsPub() && desc->GetPub().Equals(pub)) {
            return true;
        }
    }
    return false;
}


// Predicate over descriptors: true for a non-null pub descriptor whose
// Pubdesc Equals() the referenced m_Pub.
struct SPubMatch
{
    const CPubdesc& m_Pub;   // pub to compare against; not owned
    bool operator()(CRef<CSeqdesc> dsc)
    {
        if (!dsc) {
            return false;
        }
        if (!dsc->IsPub()) {
            return false;
        }
        return dsc->GetPub().Equals(m_Pub);
    }
};


// Removes from the descriptors of a Seq-entry (Bioseq or Bioseq-set) every
// pub descriptor equal to `pub`.  eRemoveDescriptor is recorded when at least
// one descriptor was erased.
void CNewCleanup_imp::x_RemovePub(CSeq_entry& se, const CPubdesc& pub)
{
    // locate the descriptor list of whichever variant the entry holds
    CSeq_descr::Tdata* descriptors = nullptr;
    if (se.IsSeq()) {
        CBioseq& bioseq = se.SetSeq();
        if (bioseq.IsSetDescr()) {
            descriptors = &bioseq.SetDescr().Set();
        }
    } else if (se.IsSet()) {
        CBioseq_set& bioseq_set = se.SetSet();
        if (bioseq_set.IsSetDescr()) {
            descriptors = &bioseq_set.SetDescr().Set();
        }
    }
    if (descriptors == nullptr) {
        return;
    }

    SPubMatch is_same_pub{ pub };
    const size_t count_before = descriptors->size();
    descriptors->erase(
        std::remove_if(descriptors->begin(), descriptors->end(), is_same_pub),
        descriptors->end());
    if (descriptors->size() != count_before) {
        ChangeMade(CCleanupChange::eRemoveDescriptor);
    }
}


// For mut/pop/phy/eco/wgs/small-genome sets: a pub descriptor of the first
// member that is also present on every other member is copied to the set
// itself (unless the set already has an equal pub) and then removed from all
// members.
void CNewCleanup_imp::x_MovePopPhyMutPub(CBioseq_set& bioseq_set)
{
    if (!bioseq_set.IsSetSeq_set() || bioseq_set.GetSeq_set().empty() || !bioseq_set.IsSetClass()) {
        return;
    }
    switch (bioseq_set.GetClass()) {
    case CBioseq_set::eClass_mut_set:
    case CBioseq_set::eClass_pop_set:
    case CBioseq_set::eClass_phy_set:
    case CBioseq_set::eClass_eco_set:
    case CBioseq_set::eClass_wgs_set:
    case CBioseq_set::eClass_small_genome_set:
        break;
    default:
        // wrong kind of set
        return;
    }

    const auto& members = bioseq_set.GetSeq_set();
    const auto first_member = members.begin();
    if (!(*first_member)->IsSetDescr()) {
        // the first member has no descriptors, so nothing can be common
        return;
    }

    vector<CRef<CPubdesc> > common_pubs;

    for (const auto& desc : (*first_member)->GetDescr().Get()) {
        if (!desc->IsPub()) {
            continue;
        }
        const CPubdesc& candidate = desc->GetPub();

        // the candidate must appear on every member after the first
        bool on_every_member = true;
        auto other = first_member;
        for (++other; other != members.end(); ++other) {
            if (!(*other)->IsSetDescr() ||
                !IsPubInSet((*other)->GetDescr(), candidate)) {
                on_every_member = false;
                break;
            }
        }
        if (!on_every_member) {
            continue;
        }

        if (!bioseq_set.IsSetDescr() || !IsPubInSet(bioseq_set.GetDescr(), candidate)) {
            // copy pub to parent
            CRef<CSeqdesc> parent_copy(new CSeqdesc());
            parent_copy->Assign(*desc);
            bioseq_set.SetDescr().Set().push_back(parent_copy);
            ChangeMade(CCleanupChange::eAddDescriptor);
        }
        // remember an independent copy; the originals are erased below
        CRef<CPubdesc> pub_copy(new CPubdesc());
        pub_copy->Assign(candidate);
        common_pubs.push_back(pub_copy);
    }

    // remove the promoted pubs from all children
    for (auto& member : bioseq_set.SetSeq_set()) {
        for (const auto& pub : common_pubs) {
            x_RemovePub(*member, *pub);
        }
    }

}


// Helper type for the DBLink handling of nuc-prot set cleanup (it is the
// element type of the vector built in CNewCleanup_imp::x_MoveNpDBlinks).
// It's out here because C++ doesn't like templates on
// local types.
namespace {
    // One DBLink descriptor found on a descendant Bioseq, recorded so it can
    // be erased later.
    struct SDblinkDeleteInfo
    {
        // position of the DBLink descriptor inside its descriptor list
        CSeq_descr_Base::Tdata::iterator  pDBLinkDesc_iter;
        // the Bioseq whose descriptor list the iterator points into
        CRef<CBioseq>                     pDBLinkDescBioseq;
    };
}


// Collapses a Bioseq-set that contains exactly one entry which is itself a
// Bioseq-set: the child's annotations and entries are moved to the parent,
// its descriptors are copied to the parent, and the (now emptied) child
// entry is removed.  Records eCollapseSet.  Does nothing when the seq-set is
// unset, does not have exactly one member, or the member is not a set.
void CNewCleanup_imp::x_CollapseSet(CBioseq_set& bioseq_set)
{
    if (!bioseq_set.IsSetSeq_set()) {
        return;
    }
    const auto& seqset = bioseq_set.GetSeq_set();
    if (seqset.size() != 1) {
        return;
    }

    const auto& only = seqset.front();

    if (only->IsSet()) {
        // p: edit handle of the parent; ch: handle of the single child entry,
        // obtained before anything is moved
        CBioseq_set_EditHandle p = m_Scope->GetBioseq_setEditHandle(bioseq_set);
        CSeq_entry_Handle ch = m_Scope->GetSeq_entryHandle(*only);
        const CBioseq_set& child = bioseq_set.GetSeq_set().front()->GetSet();
        if (child.IsSetAnnot()) {
            // each TakeAnnot moves the front annotation to the parent; loop
            // until the child's annot list is empty
            while (!child.GetAnnot().empty()) {
                CSeq_annot_Handle ah = m_Scope->GetSeq_annotHandle(*(child.GetAnnot().front()));
                CSeq_annot_EditHandle eh = ah.GetEditHandle();
                p.TakeAnnot(eh);
            }
        }
        if (child.IsSetDescr()) {
            // descriptors are copied (Assign) rather than moved
            const auto& cdset = child.GetDescr().Get();
            for (auto it : cdset) {
                CRef<CSeqdesc> cpy(new CSeqdesc());
                cpy->Assign(*it);
                p.AddSeqdesc(*cpy);
            }
        }
        if (child.IsSetSeq_set()) {
            // each TakeEntry moves the front entry to the parent; loop until
            // the child's seq-set is empty
            while (!child.GetSeq_set().empty()) {
                CSeq_entry_Handle h = m_Scope->GetSeq_entryHandle(*(child.GetSeq_set().front()));
                CSeq_entry_EditHandle eh = h.GetEditHandle();
                p.TakeEntry(eh);
            }
        }
        // finally drop the child entry itself
        CSeq_entry_EditHandle ech = ch.GetEditHandle();
        ech.Remove();
        ChangeMade(CCleanupChange::eCollapseSet);
    }
}


// Removes every source descriptor from the set.  Before a descriptor is
// erased, and provided its org-ref has a taxname or a common name and the set
// has a seq-set, the org-ref is handed to each member so that members without
// a source descriptor receive one (see the two-argument overloads).
void CNewCleanup_imp::x_RemovePopPhyBioSource(CBioseq_set& set)
{
    if (!set.IsSetDescr()) {
        return;
    }
    auto& descriptors = set.SetDescr().Set();
    for (auto desc_it = descriptors.begin(); desc_it != descriptors.end(); ) {
        if (!(*desc_it)->IsSource()) {
            ++desc_it;
            continue;
        }

        const CBioSource& biosrc = (*desc_it)->GetSource();
        const bool has_org_name =
            biosrc.IsSetOrg() &&
            (biosrc.GetOrg().IsSetTaxname() || biosrc.GetOrg().IsSetCommon());
        if (has_org_name && set.IsSetSeq_set()) {
            // propagate down
            const COrg_ref& org = biosrc.GetOrg();
            for (auto& member : set.SetSeq_set()) {
                if (member->IsSet()) {
                    x_RemovePopPhyBioSource(member->SetSet(), org);
                } else if (member->IsSeq()) {
                    x_RemovePopPhyBioSource(member->SetSeq(), org);
                }
            }
        }

        desc_it = descriptors.erase(desc_it);
        ChangeMade(CCleanupChange::eRemoveDescriptor);
    }

}


// Returns true when the descriptor list contains a descriptor whose choice
// is `dtype`.
static bool s_HasDescriptorOfType(const CSeq_descr::Tdata& dset, CSeqdesc::E_Choice dtype)
{
    for (const auto& desc : dset) {
        if (desc->Which() == dtype) {
            return true;
        }
    }
    return false;
}


// Returns true when the Bioseq-set has descriptors and one of them has the
// choice `dtype`.
static bool s_HasDescriptorOfType(const CBioseq_set& set, CSeqdesc::E_Choice dtype)
{
    if (!set.IsSetDescr()) {
        return false;
    }
    return s_HasDescriptorOfType(set.GetDescr().Get(), dtype);
}


// Returns true when the Bioseq has descriptors and one of them has the
// choice `dtype`.
static bool s_HasDescriptorOfType(const CBioseq& seq, CSeqdesc::E_Choice dtype)
{
    if (!seq.IsSetDescr()) {
        return false;
    }
    return s_HasDescriptorOfType(seq.GetDescr().Get(), dtype);
}


// Gives a Bioseq-set that has no source descriptor a new one carrying the
// taxname and/or common name of `org`.  A set that already has a source
// descriptor is left untouched.
void CNewCleanup_imp::x_RemovePopPhyBioSource(CBioseq_set& set, const COrg_ref& org)
{
    if (!s_HasDescriptorOfType(set, CSeqdesc::e_Source)) {
        CRef<CSeqdesc> new_src(new CSeqdesc());
        // the source variant is only selected when there is a name to copy
        if (org.IsSetTaxname()) {
            new_src->SetSource().SetOrg().SetTaxname(org.GetTaxname());
        }
        if (org.IsSetCommon()) {
            new_src->SetSource().SetOrg().SetCommon(org.GetCommon());
        }
        set.SetDescr().Set().push_back(new_src);
        ChangeMade(CCleanupChange::eAddDescriptor);
    }
}


// Gives a Bioseq that has no source descriptor a new one carrying the
// taxname and/or common name of `org`.  A Bioseq that already has a source
// descriptor is left untouched.
void CNewCleanup_imp::x_RemovePopPhyBioSource(CBioseq& seq, const COrg_ref& org)
{
    if (!s_HasDescriptorOfType(seq, CSeqdesc::e_Source)) {
        CRef<CSeqdesc> new_src(new CSeqdesc());
        // the source variant is only selected when there is a name to copy
        if (org.IsSetTaxname()) {
            new_src->SetSource().SetOrg().SetTaxname(org.GetTaxname());
        }
        if (org.IsSetCommon()) {
            new_src->SetSource().SetOrg().SetCommon(org.GetCommon());
        }
        seq.SetDescr().Set().push_back(new_src);
        ChangeMade(CCleanupChange::eAddDescriptor);
    }
}


// Removes every MolInfo descriptor from the set.  Before a descriptor is
// erased it is handed to each member of the seq-set so that members without
// a MolInfo descriptor receive a copy (see the two-argument overloads).
//
// The seq-set is only touched when it is set, mirroring
// x_RemovePopPhyBioSource(CBioseq_set&); the mutable accessor SetSeq_set() is
// therefore never invoked on a set that has no seq-set.
void CNewCleanup_imp::x_RemovePopPhyMolInfo(CBioseq_set& set)
{
    if (!set.IsSetDescr()) {
        return;
    }
    auto& dset = set.SetDescr().Set();
    CBioseq_set::TDescr::Tdata::iterator d = dset.begin();
    while (d != dset.end()) {
        if ((*d)->IsMolinfo()) {
            //propagate down
            if (set.IsSetSeq_set()) {
                NON_CONST_ITERATE(CBioseq_set::TSeq_set, s, set.SetSeq_set()) {
                    if ((*s)->IsSet()) {
                        x_RemovePopPhyMolInfo((*s)->SetSet(), (*d)->GetMolinfo());
                    } else if ((*s)->IsSeq()) {
                        x_RemovePopPhyMolInfo((*s)->SetSeq(), (*d)->GetMolinfo());
                    }
                }
            }
            // the descriptor is removed from the set whether or not there
            // were members to receive it
            d = dset.erase(d);
            ChangeMade(CCleanupChange::eRemoveDescriptor);
        } else {
            ++d;
        }
    }

}


// Gives a Bioseq-set that has no MolInfo descriptor a copy of `mol`.
// A set that already has a MolInfo descriptor is left untouched.
void CNewCleanup_imp::x_RemovePopPhyMolInfo(CBioseq_set& set, const CMolInfo& mol)
{
    if (!s_HasDescriptorOfType(set, CSeqdesc::e_Molinfo)) {
        CRef<CSeqdesc> molinfo_desc(new CSeqdesc());
        molinfo_desc->SetMolinfo().Assign(mol);
        set.SetDescr().Set().push_back(molinfo_desc);
        ChangeMade(CCleanupChange::eAddDescriptor);
    }
}


// Gives a Bioseq that has no MolInfo descriptor a copy of `mol`.
// A Bioseq that already has a MolInfo descriptor is left untouched.
void CNewCleanup_imp::x_RemovePopPhyMolInfo(CBioseq& seq, const CMolInfo& mol)
{
    if (!s_HasDescriptorOfType(seq, CSeqdesc::e_Molinfo)) {
        CRef<CSeqdesc> molinfo_desc(new CSeqdesc());
        molinfo_desc->SetMolinfo().Assign(mol);
        seq.SetDescr().Set().push_back(molinfo_desc);
        ChangeMade(CCleanupChange::eAddDescriptor);
    }
}


// Moves a title descriptor from a set down to its nucleotide Bioseq.
//
// If the set has a title (the last title descriptor is used) and a
// nucleotide member without a title is encountered before any title has been
// found or copied, that member receives a copy of the set title.  Once a
// nucleotide title exists (pre-existing or copied), all title descriptors are
// erased from the set, and the set's descriptor list is reset when it
// becomes empty.
//
// A nucleotide Bioseq without any descriptors is treated as having no title;
// its descriptor list is only read when IsSetDescr() is true.
void CNewCleanup_imp::x_MoveNPTitle(CBioseq_set& set)
{
    if (!set.IsSetDescr() || !set.IsSetSeq_set()) {
        return;
    }
    const auto& dset = set.GetDescr().Get();
    CConstRef<CSeqdesc> set_title(NULL);
    for (const auto& d : dset) {
        if (d->IsTitle()) {
            set_title = d;
        }
    }
    if (!set_title) {
        return;
    }
    bool have_nuc_title = false;
    ITERATE(CBioseq_set::TSeq_set, it, set.GetSeq_set()) {
        if ((*it)->IsSeq() && (*it)->GetSeq().IsNa()) {
            const CBioseq& nuc = (*it)->GetSeq();
            // guard: do not call GetDescr() on a Bioseq with no descriptors
            if (nuc.IsSetDescr()) {
                for (const auto& d : nuc.GetDescr().Get()) {
                    if (d->IsTitle()) {
                        have_nuc_title = true;
                        break;
                    }
                }
            }
            if (!have_nuc_title) {
                CRef<CSeqdesc> new_title(new CSeqdesc());
                new_title->Assign(*set_title);
                CBioseq_Handle b = m_Scope->GetBioseqHandle(nuc);
                CBioseq_EditHandle eh = b.GetEditHandle();
                eh.AddSeqdesc(*new_title);
                ChangeMade(CCleanupChange::eAddDescriptor);
                have_nuc_title = true;
            }
        }
    }
    if (have_nuc_title) {
        //either we already had a nuc title or we copied the one from the set
        //now remove set title
        CBioseq_set_Handle b = m_Scope->GetBioseq_setHandle(set);
        CBioseq_set_EditHandle eh = b.GetEditHandle();
        CSeq_descr::Tdata::iterator d = eh.SetDescr().Set().begin();
        while (d != eh.SetDescr().Set().end()) {
            if ((*d)->IsTitle()) {
                d = eh.SetDescr().Set().erase(d);
                ChangeMade(CCleanupChange::eRemoveDescriptor);
            } else {
                ++d;
            }
        }
        if (eh.SetDescr().Set().empty()) {
            eh.ResetDescr();
        }
    }
}


// Extended cleanup steps for a nuc-prot set, applied in this order:
// collapse a nested nuc-prot set, then run the source, pub, DBLink and
// title moving steps (x_MoveNpSrc, x_MoveNpPub, x_MoveNpDBlinks,
// x_MoveNPTitle).
void CNewCleanup_imp::x_BioseqSetNucProtEC(CBioseq_set & bioseq_set)
{
    // clean up nested Nuc-Prot sets
    x_RemoveNestedNucProtSet(bioseq_set);

    x_MoveNpSrc(bioseq_set);
    x_MoveNpPub(bioseq_set);
    x_MoveNpDBlinks(bioseq_set);
    x_MoveNPTitle(bioseq_set);
}


// Collapses the set when its only member is a GenBank-class set, provided
// the outer set has a parent set or m_KeepTopNestedSet is false.
void CNewCleanup_imp::x_RemoveNestedGenBankSet(CBioseq_set & bioseq_set)
{
    if (!bioseq_set.IsSetSeq_set() || bioseq_set.GetSeq_set().size() != 1) {
        return;
    }
    if (!bioseq_set.GetSeq_set().front()->IsSet()) {
        return;
    }

    const auto& nested = bioseq_set.GetSeq_set().front()->GetSet();
    if (!nested.IsSetClass() || nested.GetClass() != CBioseq_set::eClass_genbank) {
        return;
    }

    if (bioseq_set.GetParentSet() != NULL || !m_KeepTopNestedSet) {
        x_CollapseSet(bioseq_set);
    }
}


// Collapses a nuc-prot set whose only member is itself a nuc-prot set.
void CNewCleanup_imp::x_RemoveNestedNucProtSet(CBioseq_set & bioseq_set)
{
    if (!bioseq_set.IsSetClass() ||
        bioseq_set.GetClass() != CBioseq_set::eClass_nuc_prot) {
        return;
    }
    if (!bioseq_set.IsSetSeq_set() || bioseq_set.GetSeq_set().size() != 1) {
        return;
    }
    if (!bioseq_set.GetSeq_set().front()->IsSet()) {
        return;
    }

    const auto& nested = bioseq_set.GetSeq_set().front()->GetSet();
    if (nested.IsSetClass() &&
        nested.GetClass() == CBioseq_set::eClass_nuc_prot) {
        x_CollapseSet(bioseq_set);
    }
}


// Extended cleanup for a GenBank-class set: collapse a nested GenBank set,
// then push each source descriptor of the set down onto every member (as a
// copy) and erase it from the set.  The set's descriptor list is reset when
// it ends up empty.
void CNewCleanup_imp::x_BioseqSetGenBankEC(CBioseq_set & bioseq_set)
{
    // clean up nested GenBank sets
    x_RemoveNestedGenBankSet(bioseq_set);

    // propagation needs descriptors on the set and at least one member
    if (!bioseq_set.IsSetDescr() || !bioseq_set.IsSetSeq_set() || bioseq_set.GetSeq_set().empty()) {
        return;
    }

    auto& set_descs = bioseq_set.SetDescr().Set();
    for (auto desc_it = set_descs.begin(); desc_it != set_descs.end(); ) {
        if (!(*desc_it)->IsSource()) {
            ++desc_it;
            continue;
        }

        // every member receives its own copy of the source descriptor
        for (auto& member : bioseq_set.SetSeq_set()) {
            CRef<CSeqdesc> src_copy(new CSeqdesc());
            src_copy->Assign(**desc_it);
            if (member->IsSeq()) {
                member->SetSeq().SetDescr().Set().push_back(src_copy);
            } else if (member->IsSet()) {
                member->SetSet().SetDescr().Set().push_back(src_copy);
            }
        }

        desc_it = set_descs.erase(desc_it);
        ChangeMade(CCleanupChange::eAddDescriptor);
        ChangeMade(CCleanupChange::eRemoveDescriptor);
    }

    if (set_descs.empty()) {
        bioseq_set.ResetDescr();
    }
}


// Promotes a DBLink user-object descriptor from the Bioseqs inside a set to
// the set itself.  Bails out without changes when the set already has a
// DBLink descriptor, when a DBLink sits on a nested Bioseq-set, or when the
// DBLinks found on the Bioseqs are not all identical.
void CNewCleanup_imp::x_MoveNpDBlinks(CBioseq_set& bioseq_set)
{

    // if nuc-prot set has exactly one DBLink user-object on its
    // descendent bioseqs, move it to the nuc-prot set.
    // (identical DBLinks count as one)

    // bail if there is a DBLinkDesc on the bioseq_set itself
    if (bioseq_set.IsSetDescr()) {
        const auto& dset = bioseq_set.GetDescr().Get();
        for (auto desc_it : dset) {
            if (x_IsDBLinkUserObj(*desc_it)) {
                return;
            }
        }
    }

    // descriptors to erase once we know the move is allowed
    typedef vector<SDblinkDeleteInfo> TDblinkDeleteInfoVec;
    TDblinkDeleteInfoVec dblinksToDeleteVec;
    
    // check for descendent dblinks
    VISIT_ALL_SEQENTRYS_WITHIN_SEQSET( entry_it, bioseq_set )
    {
        // the visitor yields const entries; a mutable reference is needed to
        // record list iterators for later erasure
        CRef<CSeq_entry> pEntry( & const_cast<CSeq_entry&>(*entry_it) );
        if (pEntry->IsSetDescr()) {
            auto& dset = pEntry->SetDescr().Set();
            auto desc_it = dset.begin();
            while (desc_it != dset.end())
            {
                if (!x_IsDBLinkUserObj(**desc_it)) {
                    // ignore other types of user objects
                    ++desc_it;
                    continue;
                }

                if (!pEntry->IsSeq()) {
                    // Found a DBLink on some descendent bioseq-set,
                    // so we bail out
                    return;
                }

                // there has already been a dblink.  make sure it's
                // identical
                if (!dblinksToDeleteVec.empty()) {
                    const CSeqdesc & last_dblink =
                        **dblinksToDeleteVec.rbegin()->pDBLinkDesc_iter;
                    // bail out if there is more than one DBLink user object,
                    // and they are NOT identical
                    if (!(*desc_it)->Equals(last_dblink)) {
                        return;
                    }
                }

                // remember where this DBLink lives; nothing is erased yet
                SDblinkDeleteInfo dblink_to_delete;
                dblink_to_delete.pDBLinkDesc_iter = desc_it;
                dblink_to_delete.pDBLinkDescBioseq = Ref(&pEntry->SetSeq());
                dblinksToDeleteVec.push_back(dblink_to_delete);
                ++desc_it;
            }
        }
    }

    // move the collected dblinks: copy one to the parent set, then delete
    // them from the bioseqs
    if( ! dblinksToDeleteVec.empty() ) {
        // give the parent bioseq-set a copy of the dblink
        CRef<CSeqdesc> pDblinkForParent( 
            SerialClone(
            *dblinksToDeleteVec.begin()->pDBLinkDesc_iter->GetPointer()) );
        ADD_SEQDESC_TO_SEQSET(bioseq_set, pDblinkForParent);

        // delete the dblinks below the parent
        NON_CONST_ITERATE( 
            TDblinkDeleteInfoVec, dblink_delete_info, dblinksToDeleteVec ) 
        {
            ERASE_SEQDESC_ON_BIOSEQ(
                dblink_delete_info->pDBLinkDesc_iter,
                *dblink_delete_info->pDBLinkDescBioseq );
        }
    }
}


// Absorbs source descriptors from `descr` into `srcdesc` and erases the
// absorbed ones from `descr`.
//   - When `srcdesc` is still null, the first source descriptor found is
//     copied into a newly created descriptor.
//   - When `srcdesc` already exists, a source descriptor is merged into it
//     only if CCleanup::AreBioSourcesMergeable() says so; otherwise it is
//     left in place.
void CNewCleanup_imp::x_MoveNpSrc(CRef<CSeqdesc>& srcdesc, CSeq_descr& descr)
{
    auto& descriptors = descr.Set();
    for (auto desc_it = descriptors.begin(); desc_it != descriptors.end(); ) {
        bool absorbed = false;
        if ((*desc_it)->IsSource()) {
            if (!srcdesc) {
                srcdesc.Reset(new CSeqdesc());
                srcdesc->Assign(**desc_it);
                absorbed = true;
            } else if (CCleanup::AreBioSourcesMergeable(srcdesc->GetSource(), (*desc_it)->GetSource())) {
                CCleanup::MergeDupBioSources(srcdesc->SetSource(), (*desc_it)->GetSource());
                absorbed = true;
            }
        }

        if (absorbed) {
            desc_it = descriptors.erase(desc_it);
        } else {
            ++desc_it;
        }
    }

}


// For a nuc-prot set: gathers the source descriptors of the members into a
// single source descriptor on the set.  An existing source descriptor on the
// set (the last one, if several) is used as the merge target; otherwise the
// descriptor collected from the members is appended to the set.
void CNewCleanup_imp::x_MoveNpSrc(CBioseq_set& set)
{
    const bool is_nuc_prot =
        set.IsSetClass() && set.GetClass() == CBioseq_set::eClass_nuc_prot;
    if (!is_nuc_prot || !set.IsSetSeq_set()) {
        return;
    }

    // look for a source descriptor already present on the set
    CRef<CSeqdesc> srcdesc;
    if (set.IsSetDescr()) {
        for (auto& desc : set.SetDescr().Set()) {
            if (desc->IsSource()) {
                srcdesc = desc;
            }
        }
    }
    // only a descriptor that did not come from the set needs to be added
    const bool must_add_to_set = !srcdesc;

    for (auto member : set.SetSeq_set()) {
        if (!member->IsSetDescr()) {
            continue;
        }
        if (member->IsSeq()) {
            x_MoveNpSrc(srcdesc, member->SetSeq().SetDescr());
        } else if (member->IsSet()) {
            x_MoveNpSrc(srcdesc, member->SetSet().SetDescr());
        }
    }

    if (must_add_to_set && srcdesc) {
        set.SetDescr().Set().push_back(srcdesc);
    }
}


// Processes the pub descriptors in `descr` relative to the set `np_set`:
//   - a pub already present on np_set is erased from `descr`
//     (eRemoveDescriptor);
//   - otherwise, a pub that CCleanup::OkToPromoteNpPub() accepts is copied
//     to np_set and erased from `descr` (eMoveDescriptor);
//   - anything else is left alone.
void CNewCleanup_imp::x_MoveNpPub(CBioseq_set& np_set, CSeq_descr& descr)
{
    auto& descriptors = descr.Set();
    for (auto desc_it = descriptors.begin(); desc_it != descriptors.end(); ) {
        if (!(*desc_it)->IsPub()) {
            ++desc_it;
            continue;
        }

        const CPubdesc& pub = (*desc_it)->GetPub();
        if (np_set.IsSetDescr() && CCleanup::PubAlreadyInSet(pub, np_set.GetDescr())) {
            desc_it = descriptors.erase(desc_it);
            ChangeMade(CCleanupChange::eRemoveDescriptor);
        } else if (CCleanup::OkToPromoteNpPub(pub)) {
            CRef<CSeqdesc> promoted(new CSeqdesc());
            promoted->Assign(**desc_it);
            np_set.SetDescr().Set().push_back(promoted);
            desc_it = descriptors.erase(desc_it);
            ChangeMade(CCleanupChange::eMoveDescriptor);
        } else {
            ++desc_it;
        }
    }

}


bool s_HasRefSeqPGAPStructuredComment(const CSeq_entry_Handle& seh)
{
    CSeqdesc_CI di(seh, CSeqdesc::e_User);
    while (di) {
        const auto& user = di->GetUser();
        if (user.HasField("StructuredCommentPrefix") && user.HasField("Annotation Provider")) {
            const CUser_field& field = user.GetField("StructuredCommentPrefix");
            const CUser_field& provider = user.GetField("Annotation Provider");
            if (field.IsSetData() && field.GetData().IsStr() &&
                NStr::EqualNocase(field.GetData().GetStr(), "##Genome-Annotation-Data-START##") &&
                provider.IsSetData() && provider.GetData().IsStr() &&
                NStr::EqualNocase(provider.GetData().GetStr(), "NCBI RefSeq")) {
                return true;
            }
        }
        ++di;
    }
    return false;
}


// For a nuc-prot set: promotes pub descriptors from the members to the set.
//
// Each member (Bioseq or nested Bioseq-set) that has descriptors is handed,
// together with the nuc-prot set, to x_MoveNpPub(np_set, descr); a member's
// descriptor list is reset when it ends up empty.  Members whose entry has a
// RefSeq PGAP structured comment (s_HasRefSeqPGAPStructuredComment) are
// skipped, and Bioseq members are only processed when
// CCleanup::OkToPromoteNpPub(bioseq) allows it.
//
// The promotion target is always the nuc-prot set passed in as `set`; the
// nested-set branch uses its own local names so the parameter is not
// shadowed.
void CNewCleanup_imp::x_MoveNpPub(CBioseq_set& set)
{
    if (!set.IsSetClass() || set.GetClass() != CBioseq_set::eClass_nuc_prot ||
        !set.IsSetSeq_set()) {
        return;
    }

    NON_CONST_ITERATE(CBioseq_set::TSeq_set, it, set.SetSeq_set()) {
        if ((*it)->IsSetDescr()) {
            if ((*it)->IsSeq() && (*it)->GetSeq().IsSetDescr() && CCleanup::OkToPromoteNpPub((*it)->GetSeq())) {
                CSeq_entry_Handle seh = m_Scope->GetSeq_entryHandle(**it);
                if (seh && s_HasRefSeqPGAPStructuredComment(seh)) {
                    continue;
                }
                auto& child_seq = (*it)->SetSeq();
                auto& child_descr = child_seq.SetDescr();
                x_MoveNpPub(set, child_descr);
                if (child_descr.Set().empty()) {
                    child_seq.ResetDescr();
                }
            } else if ((*it)->IsSet() && (*it)->GetSet().IsSetDescr()) {
                CSeq_entry_Handle seh = m_Scope->GetSeq_entryHandle(**it);
                if (seh && s_HasRefSeqPGAPStructuredComment(seh)) {
                    continue;
                }
                auto& child_set = (*it)->SetSet();
                auto& child_descr = child_set.SetDescr();
                // promote into the nuc-prot parent, not into the child itself
                x_MoveNpPub(set, child_descr);
                if (child_descr.Set().empty()) {
                    child_set.ResetDescr();
                }
            }
        }
    }
}


// Returns true when `desc` is a User descriptor whose user object reports
// the DBLink object type; false for every other descriptor.
bool CNewCleanup_imp::x_IsDBLinkUserObj( const CSeqdesc & desc )
{
    return desc.IsUser() &&
           desc.GetUser().GetObjectType() == CUser_object::eObjectType_DBLink;
}

// neg for "<", 0 for "==", and pos for ">"
static int s_PcrPrimerSetCompare( const CPCRPrimerSet &s1, const CPCRPrimerSet &s2 )
{
    // it's highly unlikely for this if-statement to trigger, but just in case...
    if( ! s1.IsSet() || ! s2.IsSet() ) {
        return int(s1.IsSet()) - int(s2.IsSet());
    }

    // put the primers into a set so that our comparison doesn't worry about order or dups
    typedef set< CRef<CPCRPrimer>, CPcrPrimerRefLessThan > TPrimerContainer;
    TPrimerContainer primer_set_1;
    TPrimerContainer primer_set_2;

    copy( s1.Get().begin(), s1.Get().end(), inserter(primer_set_1, primer_set_1.begin()) );
    copy( s2.Get().begin(), s2.Get().end(), inserter(primer_set_2, primer_set_2.begin()) );

    // smaller first
    if( primer_set_1.size() != primer_set_2.size() ) {
        return int(primer_set_1.size()) - int(primer_set_2.size());
    }

    // find so we can compare
    pair<TPrimerContainer::const_iterator, TPrimerContainer::const_iterator> mismatch_iter = 
        mismatch( primer_set_1.begin(), primer_set_1.end(), primer_set_2.begin(), CPCRPrimerRefEqual() );
    if( mismatch_iter.first == primer_set_1.end() ) {
        // no mismatch; they're equal
        return 0;
    }

    const int mismatch_compare = s_PcrPrimerCompare(*mismatch_iter.first, *mismatch_iter.second);
    return mismatch_compare;
}

// "Less than" functor for CRef<CPCRReaction>.
//
// Ordering rules, applied in sequence:
//   1. a null reference sorts before a non-null one (two nulls are equal);
//   2. a reaction without a forward primer set sorts before one with it;
//   3. forward primer sets are compared with s_PcrPrimerSetCompare;
//   4. the same two steps are then applied to the reverse primer sets.
class CPcrReactionLessThan {
public:

    bool operator()( 
        const CRef<CPCRReaction> &r1, const CRef<CPCRReaction> &r2 ) const
    {
        const bool first_null = r1.IsNull();
        const bool second_null = r2.IsNull();
        if( first_null || second_null ) {
            return first_null && ! second_null;
        }

        // forward primers
        const bool first_has_fwd = r1->IsSetForward();
        const bool second_has_fwd = r2->IsSetForward();
        if( first_has_fwd != second_has_fwd ) {
            // exactly one side has it; the side lacking it is "less"
            return ! first_has_fwd;
        }
        if( first_has_fwd ) {
            const int fwd_cmp = s_PcrPrimerSetCompare( r1->GetForward(), r2->GetForward() );
            if( fwd_cmp != 0 ) {
                return (fwd_cmp < 0);
            }
        }

        // reverse primers
        const bool first_has_rev = r1->IsSetReverse();
        const bool second_has_rev = r2->IsSetReverse();
        if( first_has_rev != second_has_rev ) {
            // exactly one side has it; the side lacking it is "less"
            return ! first_has_rev;
        }
        if( ! first_has_rev ) {
            // neither side has reverse primers, so the reactions are equal
            return false;
        }
        return ( s_PcrPrimerSetCompare( r1->GetReverse(), r2->GetReverse() ) < 0 );
    }

};

// Basic cleanup of a PCR reaction set.
//
// For every reaction, the forward and reverse primer sets are cleaned with
// x_PCRPrimerSetBC; a primer set that is unset or empty afterwards is reset,
// and a reaction left with neither primer set is erased.  Each of those
// removals is recorded as eChangePCRPrimers.  The two trailing macros are then
// applied to the whole set.
// NOTE(review): judging by their names, the trailing macros drop duplicate
// reactions (per CPcrReactionLessThan) without reordering and then clear the
// container if it ended up empty -- confirm against the macro definitions.
void CNewCleanup_imp::PCRReactionSetBC( CPCRReactionSet &pcr_reaction_set )
{
    EDIT_EACH_PCRREACTION_IN_PCRREACTIONSET( reaction_iter, pcr_reaction_set ) {
        CPCRReaction &reaction = **reaction_iter;

        // clean the forward primers; drop the field if nothing is left
        if( FIELD_IS_SET(reaction, Forward) ) {
            x_PCRPrimerSetBC( GET_MUTABLE(reaction, Forward) );
            if( ! GET_FIELD(reaction, Forward).IsSet() || GET_FIELD(reaction, Forward).Get().empty() ) {
                RESET_FIELD(reaction, Forward);
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
        }

        // same treatment for the reverse primers
        if( FIELD_IS_SET(reaction, Reverse) ) {
            x_PCRPrimerSetBC( GET_MUTABLE(reaction, Reverse) );
            if( ! GET_FIELD(reaction, Reverse).IsSet() || GET_FIELD(reaction, Reverse).Get().empty() ) {
                RESET_FIELD(reaction, Reverse);
                ChangeMade(CCleanupChange::eChangePCRPrimers);
            }
        }

        // a reaction with no primers at all is removed from the set
        if( ! FIELD_IS_SET(reaction, Forward) && ! FIELD_IS_SET(reaction, Reverse) ) {
            ERASE_PCRREACTION_IN_PCRREACTIONSET(reaction_iter, pcr_reaction_set);
            ChangeMade(CCleanupChange::eChangePCRPrimers);
        }
    }

    UNIQUE_WITHOUT_SORT_PCRREACTION_IN_PCRREACTIONSET( pcr_reaction_set, CPcrReactionLessThan );

    REMOVE_IF_EMPTY_PCRREACTION_IN_PCRREACTIONSET( pcr_reaction_set );
}

// Basic cleanup of a MolInfo: a Tech or Completeness field that is explicitly
// set to its "unknown" value carries no information, so it is reset.  Each
// reset is recorded as eChangeMolInfo.
void CNewCleanup_imp::MolInfoBC( CMolInfo &molinfo )
{
    // Tech must be compared against the Tech enumeration (not Biomol).
    if( FIELD_EQUALS(molinfo, Tech, NCBI_TECH(unknown) ) ) {
        RESET_FIELD(molinfo, Tech);
        ChangeMade(CCleanupChange::eChangeMolInfo);
    }

    if( FIELD_EQUALS(molinfo, Completeness, NCBI_COMPLETENESS(unknown) ) ) {
        RESET_FIELD(molinfo, Completeness);
        ChangeMade(CCleanupChange::eChangeMolInfo);
    }
}

// part of ExtendedCleanup
//
// Calls CCleanup::AddMissingMolInfo for a Bioseq whose Inst has Mol set and
// records eChangeMolInfo when that call reports a change.
//
// The `is_product` flag passed along is true only when the sequence sits in a
// gen-prod set (directly, or through an intermediate nuc-prot set) and is
// either a protein with a CDS pointing at it or an RNA with an mRNA feature
// pointing at it.
void CNewCleanup_imp::CreateMissingMolInfo( CBioseq& seq )
{
    if (!seq.IsSetInst() || !seq.GetInst().IsSetMol()) {
        return;
    }

    bool is_product = false;
    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
    // Without a valid handle there is no parent set to inspect; the sequence
    // is then treated as a non-product.
    if (bsh) {
        CBioseq_set_Handle p = bsh.GetParentBioseq_set();
        // step over an enclosing nuc-prot set to reach a possible gen-prod set
        if (p && p.IsSetClass() && p.GetClass() == CBioseq_set::eClass_nuc_prot) {
            p = p.GetParentBioseq_set();
        }
        if (p && p.IsSetClass() && p.GetClass() == CBioseq_set::eClass_gen_prod_set) {
            if (seq.IsAa() && sequence::GetCDSForProduct(seq, m_Scope) != nullptr) {
                is_product = true;
            } else if (seq.GetInst().GetMol() == CSeq_inst::eMol_rna &&
                       sequence::GetmRNAForProduct(seq, m_Scope) != nullptr) {
                is_product = true;
            }
        }
    }

    if (CCleanup::AddMissingMolInfo(seq, is_product)) {
        ChangeMade(CCleanupChange::eChangeMolInfo);
    }
}


// Asks CCleanup::AddProteinTitle to add a title for `seq`, unless the
// eClean_NoProteinTitles option is set or the sequence already carries a
// Title descriptor of its own.  A successful addition is recorded as
// eAddDescriptor.
void CNewCleanup_imp::AddProteinTitles(CBioseq& seq)
{
    if (m_Options & CCleanup::eClean_NoProteinTitles) {
        return;
    }

    // a title directly on the sequence means there is nothing to add
    if (seq.IsSetDescr()) {
        for (const auto& desc : seq.GetDescr().Get()) {
            if (desc->IsTitle()) {
                return;
            }
        }
    }

    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
    const bool title_added = CCleanup::AddProteinTitle(bsh);
    if (title_added) {
        ChangeMade(CCleanupChange::eAddDescriptor);
    }
}


// Extended cleanup of a Prot-ref: runs TrimInternalSemicolons over the
// description (recording eChangeOther if the text changed) and passes the EC
// number list, when present, to x_CleanupECNumberListEC.
void CNewCleanup_imp::ProtRefEC(CProt_ref& pr)
{
    if (pr.IsSetDesc()) {
        string trimmed = pr.GetDesc();
        TrimInternalSemicolons(trimmed);
        const bool text_changed = !NStr::Equal(trimmed, pr.GetDesc());
        if (text_changed) {
            pr.SetDesc(trimmed);
            ChangeMade(CCleanupChange::eChangeOther);
        }
    }

    if (pr.IsSetEc()) {
        x_CleanupECNumberListEC(pr.SetEc());
    }
}


// Propagates partialness from a feature to its parent feature.
//
// For each biological end (start, then stop): when `sf` is partial at that
// end, `parent` is not, and both share the same position for that end, the
// parent's location is made partial there and the parent's Partial flag is
// set.  Returns true if the parent was modified.
bool CNewCleanup_imp::x_FixParentPartials(const CSeq_feat& sf, CSeq_feat& parent)
{
    if (!sf.IsSetLocation() || !parent.IsSetLocation()) {
        // pathological input; nothing can be compared
        return false;
    }

    const auto& child_loc = sf.GetLocation();
    const auto& parent_loc = parent.GetLocation();
    bool modified = false;

    const bool copy_start_partial =
        child_loc.IsPartialStart(eExtreme_Biological) &&
        !parent_loc.IsPartialStart(eExtreme_Biological) &&
        child_loc.GetStart(eExtreme_Biological) == parent_loc.GetStart(eExtreme_Biological);
    if (copy_start_partial) {
        parent.SetLocation().SetPartialStart(true, eExtreme_Biological);
        parent.SetPartial(true);
        modified = true;
    }

    const bool copy_stop_partial =
        child_loc.IsPartialStop(eExtreme_Biological) &&
        !parent_loc.IsPartialStop(eExtreme_Biological) &&
        child_loc.GetStop(eExtreme_Biological) == parent_loc.GetStop(eExtreme_Biological);
    if (copy_stop_partial) {
        parent.SetLocation().SetPartialStop(true, eExtreme_Biological);
        parent.SetPartial(true);
        modified = true;
    }

    return modified;
}


// Extended cleanup of a coding-region feature.  Does nothing for features
// whose data is not a Cdregion.  Three independent stages follow:
//
//   1. Conflict flag: when the CDS is flagged as conflicting and has a
//      product, the CDS is translated and compared with the product sequence.
//      A mismatch sets the protein's MolInfo tech to concept_trans_a; an exact
//      match clears the conflict flag.
//   2. Unless m_IsEmblOrDdbj is set: the CDS may be extended to a stop codon
//      (CCleanup::ExtendToStopIfShortAndNotPartial), the overlapping gene and
//      mRNA are extended to match, and partialness is propagated to those
//      parents.  Parents are edited on copies and written back through edit
//      handles.
//   3. Product handling: a pseudo CDS has its product removed; otherwise the
//      product protein's partialness is resynchronised with the CDS.
//
// CException thrown in stages 1 and 2 is swallowed on purpose (see the
// comments in the catch blocks).
void CNewCleanup_imp::CdRegionEC(CSeq_feat& sf)
{
    if (!sf.IsSetData() || !sf.GetData().IsCdregion()) {
        return;
    }

    CCdregion& cdr = sf.SetData().SetCdregion();

    // --- stage 1: re-evaluate the conflict flag against the real product ---
    if (cdr.IsSetConflict() && 
        cdr.GetConflict() &&
        sf.IsSetProduct()) {
        try {
            CBioseq_Handle nuc = m_Scope->GetBioseqHandle(sf.GetLocation());
            if (nuc) {
                CSeqdesc_CI src(nuc, CSeqdesc::e_Source);
                // only attempted when the source provides a genetic code
                if (src && src->GetSource().IsSetGcode()) {
                    CBioseq_Handle prot = m_Scope->GetBioseqHandle(sf.GetProduct());
                    string expected;
                    CSeqTranslator::Translate(sf, *m_Scope, expected, false);
                    CSeqVector vec(prot, CBioseq_Handle::eCoding_Iupac);
                    CSeqVector_CI vi = vec.begin();
                    string::iterator si = expected.begin();
                    // advance both sequences while residues agree
                    while (vi != vec.end() && si != expected.end() && *vi == *si) {
                        ++vi;
                        ++si;
                    }
                    if (vi != vec.end() || si != expected.end()) {
                        // stopped early: product and translation differ
                        if (CCleanup::SetMolinfoTech(prot, CMolInfo::eTech_concept_trans_a)) {
                            ChangeMade(CCleanupChange::eChangeMolInfo);
                        }
                    } else {
                        // identical: the conflict flag is unwarranted
                        cdr.ResetConflict();
                        ChangeMade(CCleanupChange::eChangeOther);
                    }
                }
            }
        } catch (CException&) {
            // unable to get bioseq handle when loc is on multiple sequences
        }
    }

    // --- stage 2: extend to stop codon and fix gene/mRNA parents ---
    if (!m_IsEmblOrDdbj) {
        try {
            // the parent lookups are done on a copy of the CDS
            CRef<CSeq_feat> cds_cpy(new CSeq_feat());
            cds_cpy->Assign(sf);
            CConstRef<CSeq_feat> mrna = sequence::GetmRNAforCDS(*cds_cpy, *m_Scope);
            CRef<CSeq_feat> new_mrna(NULL);
            if (mrna) {
                new_mrna.Reset(new CSeq_feat());
                new_mrna->Assign(*mrna);
            }
            bool altered_mrna = false;
            CConstRef<CSeq_feat> gene = sequence::GetGeneForFeature(*cds_cpy, *m_Scope);
            CRef<CSeq_feat> new_gene(NULL);
            if (gene) {
                new_gene.Reset(new CSeq_feat());
                new_gene->Assign(*gene);
            }
            bool altered_gene = false;

            CBioseq_Handle bsh = m_Scope->GetBioseqHandle(sf.GetLocation());
            if (bsh && CCleanup::ExtendToStopIfShortAndNotPartial(sf, bsh)) {
                // the CDS grew; let the parents follow where that is allowed
                if (new_gene && CCleanup::LocationMayBeExtendedToMatch(new_gene->GetLocation(), sf.GetLocation())) {
                    if (CCleanup::ExtendStopPosition(*new_gene, &sf)) {
                        altered_gene = true;
                    }
                }
                if (new_mrna && CCleanup::LocationMayBeExtendedToMatch(new_mrna->GetLocation(), sf.GetLocation())) {
                    if (CCleanup::ExtendStopPosition(*new_mrna, &sf)) {
                        altered_mrna = true;
                    }
                }
                ChangeMade(CCleanupChange::eChangeFeatureLocation);
            }
            // propagate partialness: CDS -> gene, CDS -> mRNA, mRNA -> gene
            if (new_gene && x_FixParentPartials(sf, *new_gene)) {
                altered_gene = true;
            }
            if (new_mrna && x_FixParentPartials(sf, *new_mrna)) {
                altered_mrna = true;
            }
            if (new_mrna && new_gene && x_FixParentPartials(*new_mrna, *new_gene)) {
                altered_gene = true;
            }
            // write the edited copies back over the original features
            if (altered_gene) {
                CSeq_feat_EditHandle efh = CSeq_feat_EditHandle(m_Scope->GetSeq_featHandle(*gene));
                efh.Replace(*new_gene);
                ChangeMade(CCleanupChange::eChangeFeatureLocation);
            }
            if (altered_mrna) {
                CSeq_feat_EditHandle efh = CSeq_feat_EditHandle(m_Scope->GetSeq_featHandle(*mrna));
                efh.Replace(*new_mrna);
                ChangeMade(CCleanupChange::eChangeFeatureLocation);
            }
        } catch (CException& ) {
            // unable to get bioseq handle when loc is on multiple sequences
        }
    }

    // --- stage 3: product handling ---
    if (sf.IsSetPseudo() && sf.GetPseudo() && sf.IsSetProduct()) {
        if (CCleanup::RemovePseudoProduct(sf, *m_Scope)) {
            ChangeMade(CCleanupChange::eChangeCdregion);
        }
    } else if (sf.IsSetProduct()) {
        //resynch protein molinfo
        CBioseq_Handle prot = m_Scope->GetBioseqHandle(sf.GetProduct());
        if (prot) {
            bool partial5 = sf.GetLocation().IsPartialStart(eExtreme_Biological);
            bool partial3 = sf.GetLocation().IsPartialStop(eExtreme_Biological);     
            bool feat_partial = sf.IsSetPartial() ? sf.GetPartial() : false;
            // the handle only exposes a const Bioseq; constness is cast away
            // so that the protein's descriptors can be edited in place
            x_SetPartialsForProtein(*(const_cast<CBioseq *>(prot.GetCompleteBioseq().GetPointer())), partial5, partial3, feat_partial);
        }
    }
}


// Returns true when the source has artificial origin and its organism's
// taxname equals "synthetic construct" (compared case-insensitively).
bool CNewCleanup_imp::IsSyntheticConstruct(const CBioSource& src)
{
    const bool artificial_origin =
        src.IsSetOrigin() && src.GetOrigin() == CBioSource::eOrigin_artificial;
    if (!artificial_origin) {
        return false;
    }

    return src.IsSetOrg() &&
           src.GetOrg().IsSetTaxname() &&
           NStr::EqualNocase(src.GetOrg().GetTaxname(), "synthetic construct");
}


// Replaces the feature behind `fh` with a copy whose location is a single
// interval spanning all of `seq` (0 .. length-1) on the Seq-id of the
// original location.  The original partial-start / partial-stop state is
// carried over.  Records eChangeFeatureLocation.
//
// No change is made when the location is already such a full-length interval,
// or when the original location does not resolve to a single Seq-id.
void CNewCleanup_imp::x_ExtendFeatureToCoverSequence(CSeq_feat_Handle fh, const CBioseq& seq)
{
    const auto& loc = fh.GetLocation();
    if (loc.IsInt() &&
        loc.GetStart(eExtreme_Biological) == 0 &&
        loc.GetStop(eExtreme_Biological) == seq.GetLength() - 1) {
        // already full length, no need to change
        return;
    }

    // GetId() yields a null pointer when the location has no single id;
    // there is then no id to build the new interval on.
    const CSeq_id* loc_id = loc.GetId();
    if (!loc_id) {
        return;
    }

    const bool partial_start = loc.IsPartialStart(eExtreme_Biological);
    const bool partial_stop = loc.IsPartialStop(eExtreme_Biological);

    CRef<CSeq_feat> new_feat(new CSeq_feat());
    new_feat->Assign(*(fh.GetSeq_feat()));
    auto& new_loc = new_feat->SetLocation();
    auto& new_int = new_loc.SetInt();
    new_int.SetId().Assign(*loc_id);
    new_int.SetFrom(0);
    new_int.SetTo(seq.GetLength() - 1);
    new_loc.SetPartialStart(partial_start, eExtreme_Biological);
    new_loc.SetPartialStop(partial_stop, eExtreme_Biological);

    CSeq_feat_EditHandle eh(fh);
    eh.Replace(*new_feat);
    ChangeMade(CCleanupChange::eChangeFeatureLocation);
}


// part of ExtendedCleanup
//
// On a protein sequence of non-zero length, takes the first protein feature
// found by the feature iterator and, unless it is already a single interval
// covering the whole sequence, stretches it with
// x_ExtendFeatureToCoverSequence.
void CNewCleanup_imp::x_ExtendProteinFeatureOnProteinSeq(CBioseq& seq)
{
    if (!seq.IsSetInst()) {
        return;
    }

    // only proteins with a known, non-zero length qualify
    const auto& seq_inst = seq.GetInst();
    const bool usable_protein =
        seq_inst.IsSetLength() &&
        seq_inst.GetLength() != 0 &&
        seq_inst.IsSetMol() &&
        seq_inst.IsAa();
    if (!usable_protein) {
        return;
    }

    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
    if (!bsh) {
        return;
    }

    CFeat_CI prot_feat(bsh, CSeqFeatData::eSubtype_prot);
    if (!prot_feat) {
        // no feature to adjust
        return;
    }

    const auto& feat_loc = prot_feat->GetLocation();
    const bool covers_whole_seq =
        feat_loc.IsInt() &&
        feat_loc.GetStart(eExtreme_Biological) == 0 &&
        feat_loc.GetStop(eExtreme_Biological) == seq.GetLength() - 1;
    if (!covers_whole_seq) {
        x_ExtendFeatureToCoverSequence(*prot_feat, seq);
    }
}


// part of ExtendedCleanup
//
// On an mRNA sequence (nucleotide, non-zero length, MolInfo biomol == mRNA)
// that is not a synthetic construct, extends the gene feature to cover the
// whole sequence -- but only when the sequence carries exactly one gene and
// at most one CDS and one mRNA feature.  A gene whose location is not a
// single interval is left alone on EMBL or DDBJ sequences.
void CNewCleanup_imp::x_ExtendSingleGeneOnMrna(CBioseq& seq)
{
    if (!seq.IsSetInst()) {
        return;
    }

    // only nucleotides with a known, non-zero length qualify
    const auto& seq_inst = seq.GetInst();
    const bool usable_nucleotide =
        seq_inst.IsSetLength() &&
        seq_inst.GetLength() != 0 &&
        seq_inst.IsSetMol() &&
        seq_inst.IsNa();
    if (!usable_nucleotide) {
        return;
    }

    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
    if (!bsh) {
        return;
    }

    // must be described as mRNA
    CSeqdesc_CI molinfo_it(bsh, CSeqdesc::e_Molinfo);
    if (!molinfo_it || !molinfo_it->GetMolinfo().IsSetBiomol() ||
        molinfo_it->GetMolinfo().GetBiomol() != CMolInfo::eBiomol_mRNA) {
        return;
    }

    // skip if synthetic construct
    CSeqdesc_CI source_it(bsh, CSeqdesc::e_Source);
    if (source_it && IsSyntheticConstruct(source_it->GetSource())) {
        return;
    }

    // count genes, CDSs and mRNAs; stop scanning as soon as any kind
    // shows up a second time
    size_t gene_count = 0;
    size_t mrna_count = 0;
    size_t cds_count = 0;
    CConstRef<CSeq_feat> gene(NULL);
    for (CFeat_CI feat_it(bsh); feat_it; ++feat_it) {
        if (!feat_it->IsSetData()) {
            continue;
        }
        const auto& data = feat_it->GetData();
        if (data.IsGene()) {
            if (++gene_count > 1) {
                break;
            }
            gene.Reset(feat_it->GetSeq_feat());
        } else if (data.IsCdregion()) {
            if (++cds_count > 1) {
                break;
            }
        } else if (data.GetSubtype() == CSeqFeatData::eSubtype_mRNA) {
            if (++mrna_count > 1) {
                break;
            }
        }
    }

    if (gene_count != 1 || cds_count > 1 || mrna_count > 1) {
        return;
    }

    if (!gene->GetLocation().IsInt()) {
        // bail if location is multi-interval and sequence is EMBL or DDBJ
        ITERATE(CBioseq::TId, id, seq.GetId()) {
            if ((*id)->IsDdbj() || (*id)->IsEmbl()) {
                return;
            }
        }
    }

    x_ExtendFeatureToCoverSequence(m_Scope->GetSeq_featHandle(*gene), seq);
}


// Converts every "db_xref" qualifier that has a value into a Dbtag on the
// feature and removes the qualifier.
//
// The value is split at the first ":" into database and tag; when it cannot
// be split the database becomes "?" and the whole value becomes the tag.
// Each conversion records eChangeDbxrefs and eRemoveQualifier.  Afterwards an
// empty qualifier list is reset, and the feature's db_xrefs are sorted with
// s_DbtagCompare if they are not already (recorded as eCleanDbxrefs).
void CNewCleanup_imp::MoveDbxrefs(CSeq_feat& sf)
{
    if (!sf.IsSetQual()) {
        return;
    }

    auto& quals = sf.SetQual();
    for (auto qual_it = quals.begin(); qual_it != quals.end(); ) {
        const auto& qual = **qual_it;
        const bool is_dbxref =
            qual.IsSetQual() && qual.IsSetVal() && NStr::Equal(qual.GetQual(), "db_xref");
        if (!is_dbxref) {
            ++qual_it;
            continue;
        }

        // copy the value: the qualifier is erased below
        const string value = qual.GetVal();
        string db_name, tag_text;
        CRef<CDbtag> dbtag(new CDbtag);
        if (NStr::SplitInTwo(value, ":", db_name, tag_text)) {
            dbtag->SetDb(db_name);
            dbtag->SetTag().SetStr(tag_text);
        } else {
            dbtag->SetDb("?");
            dbtag->SetTag().SetStr(value);
        }
        sf.SetDbxref().push_back(dbtag);
        ChangeMade(CCleanupChange::eChangeDbxrefs);
        ChangeMade(CCleanupChange::eRemoveQualifier);
        qual_it = quals.erase(qual_it);
    }

    if (quals.empty()) {
        sf.ResetQual();
    }

    // keep the db_xrefs in sorted order
    if (sf.IsSetDbxref() && !DBXREF_ON_SEQFEAT_IS_SORTED(sf, s_DbtagCompare)) {
        SORT_DBXREF_ON_SEQFEAT(sf, s_DbtagCompare);
        ChangeMade(CCleanupChange::eCleanDbxrefs);
    }
}


// Moves "standard_name" qualifiers of an RNA feature into the RNA product
// name and/or the feature comment, then removes the qualifier.
//
// Skipped entirely for: non-RNA features, RNAs without a type, tmRNAs, tRNAs
// whose tRNA extension is not empty (per s_IsEmpty), EMBL/DDBJ records, and
// features without qualifiers.
//
// For each qualifying qualifier: if the RNA has no product name yet, the
// value is handed to SetRnaProductName and whatever it returns as remainder
// replaces the value (eChangeRNAref).  Any non-blank value left is appended
// to the comment, separated by "; " (eRemoveQualifier).
// NOTE(review): a qualifier whose value ends up blank is erased without
// eRemoveQualifier being recorded -- confirm whether that is intended.
void CNewCleanup_imp::MoveStandardName(CSeq_feat& sf)
{
    if (!sf.IsSetData() || !sf.GetData().IsRna()) {
        return;
    }

    const auto& rna_ref = sf.GetData().GetRna();
    if (!rna_ref.IsSetType() || rna_ref.GetType() == CRNA_ref::eType_tmRNA) {
        return;
    }

    const bool trna_with_content =
        rna_ref.GetType() == CRNA_ref::eType_tRNA &&
        rna_ref.IsSetExt() &&
        rna_ref.GetExt().IsTRNA() &&
        !s_IsEmpty(rna_ref.GetExt().GetTRNA());
    if (trna_with_content) {
        return;
    }

    // not for EMBL or DDBJ, and pointless without qualifiers
    if (m_IsEmblOrDdbj || !sf.IsSetQual()) {
        return;
    }

    auto& quals = sf.SetQual();
    for (auto qual_it = quals.begin(); qual_it != quals.end(); ) {
        const auto& qual = **qual_it;
        const bool is_standard_name =
            qual.IsSetQual() && qual.IsSetVal() && NStr::Equal(qual.GetQual(), "standard_name");
        if (!is_standard_name) {
            ++qual_it;
            continue;
        }

        string text = qual.GetVal();
        if (NStr::IsBlank(sf.GetData().GetRna().GetRnaProductName())) {
            // no product name yet: use the value, keep what does not fit
            string leftover = "";
            sf.SetData().SetRna().SetRnaProductName(text, leftover);
            text = leftover;
            ChangeMade(CCleanupChange::eChangeRNAref);
        }
        if (!NStr::IsBlank(text)) {
            if (sf.IsSetComment()) {
                text = sf.GetComment() + "; " + text;
            }
            sf.SetComment(text);
            ChangeMade(CCleanupChange::eRemoveQualifier);
        }
        qual_it = quals.erase(qual_it);
    }

    if (quals.empty()) {
        sf.ResetQual();
    }
}


// Currently a no-op: the body is empty and `feat` is left untouched.
void CNewCleanup_imp::CreatePubFromFeat(CSeq_feat& feat)
{
}


// Brings the partialness of a protein feature in line with its coding region.
//
// Processed protein features (Processed set to anything but not_set) are not
// tied to the CDS: for them the Partial flag is merely cleared when the
// location checks out as complete.
//
// For any other protein feature, the CDS whose product is the feature's
// sequence is looked up; the feature location's partial start/stop are made
// equal to the CDS's, and the Partial flag is set to "CDS is partial at
// either end or flagged partial".  Every adjustment records eChangePartial.
void CNewCleanup_imp::ResynchProteinPartials ( CSeq_feat& feat )
{
    if (!feat.IsSetData() || !feat.GetData().IsProt()) {
        return;
    }
    const auto& prot_ref = feat.GetData().GetProt();

    const bool is_processed =
        prot_ref.IsSetProcessed() &&
        prot_ref.GetProcessed() != CProt_ref::eProcessed_not_set;
    if (is_processed) {
        // not a "real" protein feature: only drop a stale Partial flag
        const unsigned int loc_partial =
            sequence::SeqLocPartialCheck(feat.GetLocation(), m_Scope);
        const bool flagged_partial = feat.IsSetPartial() && feat.GetPartial();
        if (loc_partial == sequence::eSeqlocPartial_Complete && flagged_partial) {
            feat.ResetPartial();
            ChangeMade(CCleanupChange::eChangePartial);
        }
        return;
    }

    CBioseq_Handle prot = m_Scope->GetBioseqHandle(feat.GetLocation());
    if (!prot) {
        return;
    }

    // the coding region is the reference for partialness
    const CSeq_feat* cds = sequence::GetCDSForProduct(*(prot.GetCompleteBioseq()), m_Scope);
    if (!cds) {
        return;
    }

    const auto& cds_loc = cds->GetLocation();
    const bool cds_start_partial = cds_loc.IsPartialStart(eExtreme_Biological);
    const bool cds_stop_partial = cds_loc.IsPartialStop(eExtreme_Biological);
    const bool cds_flag_partial = cds->IsSetPartial() ? cds->GetPartial() : false;

    const bool current_flag = feat.IsSetPartial() && feat.GetPartial();
    const bool wanted_flag = cds_start_partial || cds_stop_partial || cds_flag_partial;

    if (feat.GetLocation().IsPartialStart(eExtreme_Biological) != cds_start_partial) {
        feat.SetLocation().SetPartialStart(cds_start_partial, eExtreme_Biological);
        ChangeMade(CCleanupChange::eChangePartial);
    }
    if (feat.GetLocation().IsPartialStop(eExtreme_Biological) != cds_stop_partial) {
        feat.SetLocation().SetPartialStop(cds_stop_partial, eExtreme_Biological);
        ChangeMade(CCleanupChange::eChangePartial);
    }
    if (current_flag != wanted_flag) {
        feat.SetPartial(wanted_flag);
        ChangeMade(CCleanupChange::eChangePartial);
    }
}


// Makes the MolInfo completeness on a protein sequence agree with the given
// partial flags.
//
// The target completeness comes from GetCompletenessFromFlags.  Every MolInfo
// descriptor on the sequence is updated: one that already has a completeness
// is overwritten when it differs; one without is given the target value only
// when that value is neither "unknown" nor "complete" (eChangeMolInfo).
// If the sequence has no MolInfo descriptor at all, a peptide MolInfo is
// appended (eAddDescriptor), with completeness set only when a 5' or 3'
// partial was requested.
// Whenever anything changed, x_AddPartialToProteinTitle is called.
void CNewCleanup_imp::x_SetPartialsForProtein(CBioseq& seq, bool partial5, bool partial3, bool feat_partial)
{
    const CMolInfo::TCompleteness desired =
        GetCompletenessFromFlags(partial5, partial3, partial5 || partial3 || feat_partial);

    bool has_molinfo = false;
    bool modified = false;

    if (seq.IsSetDescr()) {
        for (const auto& desc : seq.SetDescr().Set()) {
            if (!desc->IsMolinfo()) {
                continue;
            }
            has_molinfo = true;

            bool needs_update;
            if (desc->GetMolinfo().IsSetCompleteness()) {
                needs_update = (desc->GetMolinfo().GetCompleteness() != desired);
            } else {
                // leaving it unset already expresses unknown / complete
                needs_update = (desired != CMolInfo::eCompleteness_unknown &&
                                desired != CMolInfo::eCompleteness_complete);
            }
            if (needs_update) {
                desc->SetMolinfo().SetCompleteness(desired);
                ChangeMade(CCleanupChange::eChangeMolInfo);
                modified = true;
            }
        }
    }

    if (!has_molinfo) {
        // no molinfo descriptor found, need to make new one
        CRef<CSeqdesc> molinfo_desc(new CSeqdesc());
        auto& molinfo = molinfo_desc->SetMolinfo();
        molinfo.SetBiomol(CMolInfo::eBiomol_peptide);
        if (partial5 || partial3) {
            molinfo.SetCompleteness(desired);
        }
        seq.SetDescr().Set().push_back(molinfo_desc);
        ChangeMade(CCleanupChange::eAddDescriptor);
        modified = true;
    }

    if (modified) {
        x_AddPartialToProteinTitle(seq);
    }
}


// part of ExtendedCleanup
//
// For a protein sequence, reads the partial state of the first protein
// feature returned by the feature iterator and applies it to the sequence's
// MolInfo through x_SetPartialsForProtein.  Nothing is done when there is no
// protein feature or when that feature is a processed one (Processed set to
// anything but not_set).
void CNewCleanup_imp::ResynchPeptidePartials (
    CBioseq& seq
)

{
    const bool is_protein =
        seq.IsSetInst() && seq.GetInst().IsSetMol() && seq.IsAa();
    if (!is_protein) {
        return;
    }
    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);

    // need to find best protein feature
    SAnnotSelector prot_selector( CSeqFeatData::eSubtype_prot );
    CFeat_CI prot_feat( bsh, prot_selector );
    if (!prot_feat) {
        // no protein feature
        return;
    }

    const auto& prot_ref = prot_feat->GetData().GetProt();
    if (prot_ref.IsSetProcessed() &&
        prot_ref.GetProcessed() != CProt_ref::eProcessed_not_set) {
        // not a "real" protein feature
        return;
    }

    const auto& feat_loc = prot_feat->GetLocation();
    const bool start_partial = feat_loc.IsPartialStart(eExtreme_Biological);
    const bool stop_partial = feat_loc.IsPartialStop(eExtreme_Biological);
    const bool flag_partial = prot_feat->IsSetPartial() ? prot_feat->GetPartial() : false;
    x_SetPartialsForProtein(seq, start_partial, stop_partial, flag_partial);
}


// Predicate for removing title descriptors that do not match a reference
// string: yields true for a Title descriptor whose text differs from m_Val,
// and false for everything else (matching titles and non-title descriptors).
// m_Val is held by reference, so the referenced string must outlive the
// predicate.
struct STitleMatchString
{
    const string& m_Val;
    bool operator()(CRef<CSeqdesc> desc)
    {
        if (!desc->IsTitle()) {
            return false;
        }
        return !NStr::Equal(desc->GetTitle(), m_Val);
    }
};

// Removes, from a protein inside a nuc-prot set, every Title descriptor whose
// text differs from the defline that CDeflineGenerator produces when told to
// ignore existing titles.  Records eRemoveDescriptor if anything was removed.
//
// Nothing is done for non-proteins, for sequences without descriptors, or
// when the immediate parent set is not of class nuc-prot.
void CNewCleanup_imp::RemoveBadProteinTitle(CBioseq& seq)
{
    const bool is_protein =
        seq.IsSetInst() && seq.GetInst().IsSetMol() && seq.IsAa();
    if (!is_protein) {
        return;
    }

    // without descriptors there can be no title to remove
    if (!seq.IsSetDescr()) {
        return;
    }

    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
    // only remove if seq is in nuc-prot set
    CBioseq_set_Handle parent_set = bsh.GetParentBioseq_set();
    const bool in_nuc_prot =
        parent_set && parent_set.IsSetClass() &&
        parent_set.GetClass() == CBioseq_set::eClass_nuc_prot;
    if (!in_nuc_prot) {
        return;
    }

    const string generated_title =
        sequence::CDeflineGenerator().GenerateDefline(bsh, sequence::CDeflineGenerator::fIgnoreExisting);
    STitleMatchString differs_from_generated{ generated_title };

    auto& descriptors = seq.SetDescr().Set();
    auto first_removed =
        std::remove_if(descriptors.begin(), descriptors.end(), differs_from_generated);
    if (first_removed != descriptors.end()) {
        descriptors.erase(first_removed, descriptors.end());
        ChangeMade(CCleanupChange::eRemoveDescriptor);
    }
}


// part of ExtendedCleanup
//
// For every feature on `seq` that has a "citation" qualifier, replaces the
// feature with a copy from which all "citation" qualifiers are removed.  A
// qualifier whose value is all digits is additionally turned into a Cit entry:
// the value is used as an index into CCleanup::GetCitationList(bsh) and, when
// in range, a copy of that publication is appended to the feature's Cit.
// The citation list is fetched at most once, on first need.
// NOTE(review): the value is used as a 0-based index as-is -- confirm that
// this matches the numbering used in citation qualifiers.
// NOTE(review): no ChangeMade() is recorded here -- confirm whether the
// caller accounts for the change.
void CNewCleanup_imp::MoveCitationQuals(CBioseq& seq)
{
    vector<CConstRef<CPub> > citations;
    bool citations_loaded = false;

    CBioseq_Handle bsh = m_Scope->GetBioseqHandle(seq);
    for (CFeat_CI feat_it(bsh); feat_it; ++feat_it) {
        if (!feat_it->IsSetQual()) {
            continue;
        }

        // cheap scan first: leave features without citation quals untouched
        bool has_citation = false;
        for (const auto& qual : feat_it->GetQual()) {
            if (qual->IsSetQual() && NStr::Equal(qual->GetQual(), "citation")) {
                has_citation = true;
                break;
            }
        }
        if (!has_citation) {
            continue;
        }

        CRef<CSeq_feat> edited(new CSeq_feat());
        edited->Assign(*(feat_it->GetSeq_feat()));
        auto& quals = edited->SetQual();
        for (auto qual_it = quals.begin(); qual_it != quals.end(); ) {
            const auto& qual = **qual_it;
            if (!qual.IsSetQual() || !NStr::Equal(qual.GetQual(), "citation")) {
                ++qual_it;
                continue;
            }

            // numeric values become Cit entries; anything else is just dropped
            if (qual.IsSetVal() && s_IsAllDigits(qual.GetVal())) {
                if (!citations_loaded) {
                    citations = CCleanup::GetCitationList(bsh);
                    citations_loaded = true;
                }
                const size_t index = NStr::StringToNonNegativeInt(qual.GetVal());
                if (index < citations.size()) {
                    CRef<CPub> pub_copy(new CPub());
                    pub_copy->Assign(*(citations[index]));
                    edited->SetCit().SetPub().push_back(pub_copy);
                }
            }
            qual_it = quals.erase(qual_it);
        }
        if (quals.empty()) {
            edited->ResetQual();
        }

        CSeq_feat_EditHandle edit_handle(feat_it->GetSeq_feat_Handle());
        edit_handle.Replace(*edited);
    }
}


// Bioseq overload: obtains an edit handle for `seq` and forwards it to
// CCleanup::RemoveUnseenTitles; a reported removal is recorded as
// eRemoveDescriptor.
void CNewCleanup_imp::x_RemoveUnseenTitles(CBioseq& seq)
{
    CBioseq_EditHandle edit_handle(m_Scope->GetBioseqHandle(seq));
    const bool removed = CCleanup::RemoveUnseenTitles(edit_handle);
    if (removed) {
        ChangeMade(CCleanupChange::eRemoveDescriptor);
    }
}


// Bioseq-set overload: obtains an edit handle for `set` and forwards it to
// CCleanup::RemoveUnseenTitles; a reported removal is recorded as
// eRemoveDescriptor.
void CNewCleanup_imp::x_RemoveUnseenTitles(CBioseq_set& set)
{
    CBioseq_set_EditHandle edit_handle(m_Scope->GetBioseq_setHandle(set));
    const bool removed = CCleanup::RemoveUnseenTitles(edit_handle);
    if (removed) {
        ChangeMade(CCleanupChange::eRemoveDescriptor);
    }
}


struct SLaterDate {
    const CDate& m_Date;
    CSeqdesc::E_Choice date_type;

    bool operator()(CRef<CSeqdesc> desc) {
        if (desc->Which() != date_type) {
            return false;
        }
        CDate::ECompare compare;
        if (date_type == CSeqdesc::e_Create_date) {
            compare = m_Date.Compare(desc->GetCreate_date());
        } else {
            compare = m_Date.Compare(desc->GetUpdate_date());
        }
        return (compare != CDate::eCompare_same);
    }
};


// Predicate over descriptors: true when the descriptor's choice equals
// `date_type`.
struct SIsDate{
    CSeqdesc::E_Choice date_type;

    bool operator()(CRef<CSeqdesc> desc) {
        const bool same_kind = (date_type == desc->Which());
        return same_kind;
    }
};


// Keeps the first descriptor of kind `date_type` and removes every later
// descriptor of that kind.  Descriptors of other kinds are unaffected.
void RemoveDatesAfterFirst(CSeq_descr& seq_descr, CSeqdesc::E_Choice date_type)
{
    auto& descriptors = seq_descr.Set();
    SIsDate is_wanted_date{ date_type };

    auto first_date = std::find_if(descriptors.begin(), descriptors.end(), is_wanted_date);
    if (first_date == descriptors.end()) {
        // no descriptor of this kind at all
        return;
    }

    // everything of this kind past the first occurrence goes away
    ++first_date;
    descriptors.erase(std::remove_if(first_date, descriptors.end(), is_wanted_date),
                      descriptors.end());
}


// Reduces the descriptors of kind `date_type` to a single one holding the
// latest date.
//
// Returns false (and changes nothing) when fewer than two such descriptors
// are present.  Otherwise every descriptor of that kind whose date does not
// compare as "same" to the latest one is removed, then all but the first
// remaining one are removed, and true is returned.
bool RemoveEarlierDates(CSeq_descr & seq_descr, CSeqdesc::E_Choice date_type)
{
    auto& descriptors = seq_descr.Set();
    const bool use_create_date = (date_type == CSeqdesc::e_Create_date);

    // find latest item
    CConstRef<CDate> newest;
    size_t date_count = 0;
    for (const auto& desc : descriptors) {
        if (desc->Which() != date_type) {
            continue;
        }
        const CDate& candidate = use_create_date
            ? desc->GetCreate_date()
            : desc->GetUpdate_date();
        if (!newest || newest->Compare(candidate) == CDate::eCompare_before) {
            newest.Reset(&candidate);
        }
        ++date_count;
    }

    if (date_count < 2) {
        // nothing to do here
        return false;
    }

    SLaterDate not_newest{ *newest, date_type };
    descriptors.erase(std::remove_if(descriptors.begin(), descriptors.end(), not_newest),
                      descriptors.end());

    // several descriptors may share the latest date; keep just one
    RemoveDatesAfterFirst(seq_descr, date_type);

    return true;
}


// Runs RemoveEarlierDates for create dates and then for update dates,
// recording eRemoveDescriptor for each pass that reports a change.
void CNewCleanup_imp::KeepLatestDateDesc(CSeq_descr & seq_descr)
{
    static const CSeqdesc::E_Choice kDateKinds[] = {
        CSeqdesc::e_Create_date,
        CSeqdesc::e_Update_date
    };

    for (const CSeqdesc::E_Choice kind : kDateKinds) {
        if (RemoveEarlierDates(seq_descr, kind)) {
            ChangeMade(CCleanupChange::eRemoveDescriptor);
        }
    }
}


// Collapses a genbank-class set that holds exactly one member, that member
// being a Bioseq, into the Bioseq itself.  Any other set is left alone.
void CNewCleanup_imp::x_SingleSeqSetToSeq(CBioseq_set& set)
{
    const bool single_seq_member =
        set.IsSetSeq_set() &&
        set.GetSeq_set().size() == 1 &&
        set.GetSeq_set().front()->IsSeq();
    if (!single_seq_member) {
        return;
    }

    const bool is_genbank_set =
        set.IsSetClass() && set.GetClass() == CBioseq_set::eClass_genbank;
    if (!is_genbank_set) {
        return;
    }

    CBioseq_set_Handle set_handle = m_Scope->GetBioseq_setHandle(set);
    CSeq_entry_EditHandle entry_edit(set_handle.GetParentEntry());
    // This call will remove annots/descrs from the
    // set and attach them to the seq.
    entry_edit.ConvertSetToSeq();
}


// Forwards to CCleanup::MergeDupBioSources and records eRemoveDupBioSource
// when it reports a change.
void CNewCleanup_imp::x_MergeDupBioSources(CSeq_descr & seq_descr)
{
    const bool merged = CCleanup::MergeDupBioSources(seq_descr);
    if (merged) {
        ChangeMade(CCleanupChange::eRemoveDupBioSource);
    }
}


// Runs the entry-wide ("global") extended-cleanup passes on `seh`, in the
// order written below.  Each pass is a CCleanup static that reports whether
// it changed anything; the corresponding change codes are recorded when it
// did.  Finally, features whose IDs were adjusted by
// CFixFeatureId::s_ApplyToSeqInSet are written back through edit handles.
// NOTE(review): the feature-ID write-back records no ChangeMade() -- confirm
// whether that is intended.
void CNewCleanup_imp::x_ExtendedCleanupExtra(CSeq_entry_Handle seh)
{
    // This is for global changes
    // (the gene-qualifier normalization pass below is compiled out)
#if 0
    if (CCleanup::NormalizeGeneQuals(seh)) {
        ChangeMade(CCleanupChange::eChangeGeneRef);
        ChangeMade(CCleanupChange::eRemoveQualifier);
    }
#endif
    if (CCleanup::FixGeneXrefSkew(seh)) {
        ChangeMade(CCleanupChange::eChangeGeneRef);
    }
    if (CCleanup::MoveProteinSpecificFeats(seh)) {
        ChangeMade(CCleanupChange::eMoveFeat);
    }
    if (CCleanup::ConvertPubFeatsToPubDescs(seh)) {
        ChangeMade(CCleanupChange::eAddDescriptor);
        ChangeMade(CCleanupChange::eRemoveFeat);
    }
    if (CCleanup::RescueSiteRefPubs(seh)) {
        ChangeMade(CCleanupChange::eAddDescriptor);
        ChangeMade(CCleanupChange::eRemoveFeat);
    }
    if (CCleanup::ConvertSrcFeatsToSrcDescs(seh)) {
        ChangeMade(CCleanupChange::eAddDescriptor);
        ChangeMade(CCleanupChange::eRemoveFeat);
    }
    if (CCleanup::RenormalizeNucProtSets(seh)) {
        ChangeMade(CCleanupChange::eCollapseSet);
    }
    if (CCleanup::RepairXrefs(seh)) {
        ChangeMade(CCleanupChange::eAddSeqFeatXref);
    }
    if (CCleanup::RepackageProteins(seh)) {
        ChangeMade(CCleanupChange::eChangeOther);
    }
    // as requested in RW-726, uniquify feature IDs
    // (changed_feats maps each original feature handle to its replacement)
    map<CSeq_feat_Handle, CRef<CSeq_feat> > changed_feats;
    CFixFeatureId::s_ApplyToSeqInSet(seh, changed_feats);
    for (auto &fh_feat : changed_feats)
    {
        auto orig_feat = fh_feat.first;
        auto new_feat = fh_feat.second;
        CSeq_feat_EditHandle feh(orig_feat);
        feh.Replace(*new_feat);
    }
}


void CNewCleanup_imp::ExtendedCleanupSeqEntry (
    CSeq_entry& seq_entry
)

{
    // Basic cleanup is part of extended cleanup and always runs first.
    BasicCleanupSeqEntry( seq_entry );

    // x_AddNcbiCleanupObject() is skipped when the
    // eClean_NoNcbiUserObjects option bit is set.
    const bool suppress_user_objects =
        (m_Options & CCleanup::eClean_NoNcbiUserObjects) != 0;
    if ( !suppress_user_objects ) {
        x_AddNcbiCleanupObject(seq_entry);
    }

    // Autogenerated traversal of the entry.
    CAutogeneratedExtendedCleanup generated_cleanup( *m_Scope, *this );
    generated_cleanup.ExtendedCleanupSeqEntry( seq_entry );

    // Entry-wide passes operate on the scope's handle for this entry.
    x_ExtendedCleanupExtra(m_Scope->GetSeq_entryHandle(seq_entry));
    // TODO: implement more of ExtendedCleanup
}

//LCOV_EXCL_START
//not used by asn_cleanup because we clean the submit block separately
//and use read hooks for the seq-entries
void CNewCleanup_imp::ExtendedCleanupSeqSubmit (
    CSeq_submit& ss
)
{
    // Basic cleanup is part of extended cleanup and always runs first.
    BasicCleanupSeqSubmit( ss );

    // Unless the eClean_NoNcbiUserObjects option bit is set, call
    // x_AddNcbiCleanupObject() on every entry of an "entrys" submission.
    const bool add_user_objects =
        (m_Options & CCleanup::eClean_NoNcbiUserObjects) == 0;
    if ( add_user_objects && ss.IsEntrys() ) {
        for (CRef<CSeq_entry> entry_ref : ss.SetData().SetEntrys()) {
            x_AddNcbiCleanupObject(*entry_ref);
        }
    }

    // Autogenerated traversal of the whole submission.
    CAutogeneratedExtendedCleanup generated_cleanup( *m_Scope, *this );
    generated_cleanup.ExtendedCleanupSeqSubmit( ss );

    // Entry-wide passes, run once per contained entry via its scope handle.
    if (ss.IsSetData() && ss.GetData().IsEntrys()) {
        for (CRef<CSeq_entry> entry_ref : ss.SetData().SetEntrys()) {
            x_ExtendedCleanupExtra(m_Scope->GetSeq_entryHandle(*entry_ref));
        }
    }

    // TODO: implement more of ExtendedCleanup
}
//LCOV_EXCL_STOP

void CNewCleanup_imp::ExtendedCleanupSeqAnnot (
    CSeq_annot& sa
)

{
    // Basic cleanup is part of extended cleanup and always runs first.
    BasicCleanupSeqAnnot( sa );

    // The autogenerated traversal is the only extended step applied to a
    // stand-alone annotation here.
    CAutogeneratedExtendedCleanup generated_cleanup( *m_Scope, *this );
    generated_cleanup.ExtendedCleanupSeqAnnot( sa );

    // TODO: implement more of ExtendedCleanup
}

//LCOV_EXCL_START
//not used by asn_cleanup because we clean the submit block separately
//and then clean the seq-entries separately
void CNewCleanup_imp::SetGlobalFlags(const CSeq_submit& ss)
{
    ResetGlobalFlags();
    if (ss.IsEntrys()) {
        const auto& entries = ss.GetData().GetEntrys();
        for (auto it : entries) {
            SetGlobalFlags((*it), false);
        }
    }
}
//LCOV_EXCL_STOP




void CNewCleanup_imp::SetGlobalFlags(const CSeq_entry& se, bool reset)
{
    if (reset) {
        ResetGlobalFlags();
    }

    if (se.IsSeq()) {
        SetGlobalFlags(se.GetSeq(), false);
    } else if (se.IsSet()) {
        SetGlobalFlags(se.GetSet(), false);
    }
}


void CNewCleanup_imp::SetGlobalFlags(const CBioseq_set& set, bool reset)
{
    if (reset) {
        ResetGlobalFlags();
    }

    if (!set.IsSetSeq_set()) {
        return;
    }
    // Accumulate flags from every member entry without resetting in between.
    for (const auto& member_ref : set.GetSeq_set()) {
        SetGlobalFlags(*member_ref, false);
    }
}


void CNewCleanup_imp::SetGlobalFlags(const CBioseq& bs, bool reset)
{
    if (reset) {
        ResetGlobalFlags();
    }

    if (!CCleanup::ShouldStripPubSerial(bs)) {
        m_StripSerial = false;
    }

    ITERATE(CBioseq::TId, id, bs.GetId()) {
        const CSeq_id& sid = **id;
        switch (sid.Which()) {
            case NCBI_SEQID(Embl):
            case NCBI_SEQID(Ddbj):
                m_IsEmblOrDdbj = true;
                break;
            default:
                break;
        }
    }
}


void CNewCleanup_imp::SubmitblockBC(CSubmit_block& sb)
{
    // The only part of the submit block touched here is the author list of
    // its citation, and only when both the citation and its authors are set.
    if (!sb.IsSetCit() || !sb.GetCit().IsSetAuthors()) {
        return;
    }
    x_AuthListBCWithFixInitials(sb.SetCit().SetAuthors());
}

void CNewCleanup_imp::ExtendedCleanupSeqEntryHandle (
        CSeq_entry_Handle& seh )
{
    // Fetch the complete entry behind the handle first, then switch this
    // object over to the handle's scope before cleaning.
    const CSeq_entry* complete_entry = seh.GetCompleteSeq_entry().GetPointer();
    SetScope(seh.GetScope());

    // The handle only yields a const entry, while ExtendedCleanupSeqEntry()
    // takes a non-const reference, so constness is cast away here.
    ExtendedCleanupSeqEntry(*const_cast<CSeq_entry*>(complete_entry));
}

END_SCOPE(objects)
END_NCBI_SCOPE

