diff --git a/src/msta_scores.cpp b/src/msta_scores.cpp index a2a5b8d..e3389cd 100644 --- a/src/msta_scores.cpp +++ b/src/msta_scores.cpp @@ -21,11 +21,11 @@ void cmd_msta_scores() const bool DoCore = opt_core; - string LabelDir; - if (optset_labeldir) + string SeqsDir; + if (optset_seqsdir) { - LabelDir = string(opt_labeldir); - Dirize(LabelDir); + SeqsDir = string(opt_seqsdir); + Dirize(SeqsDir); } const uint N = SIZE(Accs); @@ -45,18 +45,12 @@ void cmd_msta_scores() } SeqDB MSA; - if (optset_labeldir) + if (optset_seqsdir) { - set EvalLabels; - vector EvalLabelVec; - string LabelFN = LabelDir + Acc; - ReadLinesFromFile(LabelFN, EvalLabelVec); - uint N = SIZE(EvalLabelVec); - if (N == 0) - Die("Empty -labels file"); - for (uint i = 0; i < N; ++i) - EvalLabels.insert(EvalLabelVec[i]); - MSA.FromFasta_Labels(FN, EvalLabels, true); + string SeqsFN = SeqsDir + Acc; + SeqDB EvalSeqs; + EvalSeqs.FromFasta(SeqsFN, false); + MSA.FromFasta_Seqs(FN, EvalSeqs, true); } else MSA.FromFasta(FN, true); diff --git a/src/myopts.h b/src/myopts.h index ca99ba8..b68143b 100644 --- a/src/myopts.h +++ b/src/myopts.h @@ -59,7 +59,7 @@ STR_OPT(scoredist) STR_OPT(ref) STR_OPT(label) STR_OPT(labels) -STR_OPT(labeldir) +STR_OPT(seqsdir) STR_OPT(fasta) STR_OPT(feature_fasta) STR_OPT(alnout) diff --git a/src/seqdb.cpp b/src/seqdb.cpp index 5c938e9..e17bbf3 100644 --- a/src/seqdb.cpp +++ b/src/seqdb.cpp @@ -163,36 +163,49 @@ void SeqDB::SetIsNucleo() m_IsNucleoSet = true; } -void SeqDB::FromFasta_Labels(const string &FileName, - const set &Labels, bool AllowGaps) +void SeqDB::FromFasta_Seqs(const string &FileName, + const SeqDB &EvalSeqs, bool AllowGaps) { SFasta SF; SF.Open(FileName); SF.m_AllowGaps = AllowGaps; uint FoundCount = 0; m_IsAligned = false; + set EvalSeqSet; + const uint EvalSeqCount = EvalSeqs.GetSeqCount(); + for (uint i = 0; i < EvalSeqCount; ++i) + EvalSeqSet.insert(EvalSeqs.GetSeq(i)); + for (;;) { const char* Seq = SF.GetNextSeq(); if (Seq == 0) break; - const string Label = SF.GetLabel(); - if (Labels.find(Label) == Labels.end()) - continue; - ++FoundCount; const unsigned L = SF.GetSeqLength(); if (L == 0) continue; + const string Label = SF.GetLabel(); + + string s2; + for (uint i = 0; i < L; ++i) + { + char c = Seq[i]; + if (!isgap(c)) + s2 += toupper(c); + } + if (EvalSeqSet.find(s2) == EvalSeqSet.end()) + continue; + + ++FoundCount; string s; for (unsigned i = 0; i < L; ++i) s.push_back(Seq[i]); AddSeq(Label, s); } - const uint LabelCount = SIZE(Labels); if (FoundCount == 0) Die("No labels found"); - if (FoundCount < LabelCount) - Warning("%u / %u labels not found", LabelCount - FoundCount, LabelCount); + if (FoundCount < EvalSeqCount) + Warning("%u / %u labels not found", EvalSeqCount - FoundCount, EvalSeqCount); } void SeqDB::FromFasta(const string& FileName, bool AllowGaps) diff --git a/src/seqdb.h b/src/seqdb.h index 7c55264..f9b37f6 100644 --- a/src/seqdb.h +++ b/src/seqdb.h @@ -42,7 +42,7 @@ class SeqDB bool GetIsNucleo(); unsigned GetSeqCount() const { return SIZE(m_Seqs); } void FromFasta(const string &FileName, bool AllowGaps = false); - void FromFasta_Labels(const string &FileName, const set &Labels, bool AllowGaps = false); + void FromFasta_Seqs(const string &FileName, const SeqDB &Seqs, bool AllowGaps = false); void WritePretty(FILE *f) const; void LogMe() const;