-
Notifications
You must be signed in to change notification settings - Fork 118
/
Copy pathRefs.h
159 lines (122 loc) · 4.08 KB
/
Refs.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#ifndef REFS
#define REFS
#include<cstdio>
#include<cstring>
#include<cstdlib>
#include<cassert>
#include<string>
#include<fstream>
#include<vector>
#include "utils.h"
#include "RefSeq.h"
#include "RefSeqPolicy.h"
#include "PolyARules.h"
class Refs {
public:
Refs() {
M = 0;
seqs.clear();
has_polyA = false;
}
~Refs() {}
void makeRefs(char*, RefSeqPolicy&, PolyARules&);
void loadRefs(char*, int = 0);
void saveRefs(char*);
int getM() { return M; } // get number of isoforms
//int getNS() { return M + 1; } // get number of parameters, I do not think we need it here.
RefSeq& getRef(int sid) { return seqs[sid]; } // get a particular reference
std::vector<RefSeq>& getRefs() { return seqs; } // may be slow, for copying the whole thing
bool hasPolyA() { return has_polyA; } // if any of sequence has poly(A) tail added
//lim : >=0 If mismatch > lim , return; -1 find all mismatches
int countMismatch(const std::string& seq, int pos, const std::string& readseq, int LEN, int lim = -1) {
int nMis = 0; // number of mismatches
for (int i = 0; i < LEN; i++) {
char rc = toupper(readseq[i]);
if (seq[i + pos] == 'N' || rc == 'N' || seq[i + pos] != rc) nMis++;
// a speed up tech
if (lim >= 0 && nMis > lim) return nMis;
}
return nMis;
}
bool isValid(int sid, int dir, int pos, const std::string& readseq, int LEN, int C) {
if (sid <= 0 || sid > M || (dir != 0 && dir != 1) || pos < 0 || pos + LEN > seqs[sid].getTotLen() || LEN > (int)readseq.length()) return false;
const std::string& seq = seqs[sid].getSeq(dir);
return countMismatch(seq, pos, readseq, LEN, C) <= C;
}
// get segment from refs
std::string getSegment(int sid, int dir, int pos, int LEN) {
if (pos < 0 || pos + LEN > seqs[sid].getTotLen()) return "fail";
const std::string& seq = seqs[sid].getSeq(dir);
std::string seg = "";
for (int i = 0; i < LEN; i++)
seg.append(1, seq[pos + i]);
return seg;
}
private:
int M; // # of isoforms, id starts from 1
std::vector<RefSeq> seqs; // reference sequences, starts from 1; 0 is for noise gene
bool has_polyA; // if at least one sequence has polyA added, the value is true; otherwise, the value is false
};
//inpF in fasta format
void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) {
//read standard fasta format here
std::ifstream fin;
std::string tag, line, rawseq;
seqs.clear();
seqs.push_back(RefSeq()); // noise isoform
M = 0;
has_polyA = false;
fin.open(inpF);
if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
getline(fin, line);
while ((fin) && (line[0] == '>')) {
tag = line.substr(1);
rawseq = "";
while((getline(fin, line)) && (line[0] != '>')) {
rawseq += line;
}
if (rawseq.size() <= 0) {
fprintf(stderr, "Warning: Fasta entry %s has an empty sequence! It is omitted!\n", tag.c_str());
continue;
}
++M;
seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag)));
has_polyA = has_polyA || seqs[M].getFullLen() < seqs[M].getTotLen();
}
fin.close();
if (verbose) { printf("Refs.makeRefs finished!\n"); }
}
//inpF in fasta format, with sequence all in one line together
//option 0 read all, 1 do not read sequences
void Refs::loadRefs(char *inpF, int option) {
std::ifstream fin;
RefSeq seq;
fin.open(inpF);
if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
seqs.clear();
seqs.push_back(RefSeq());
M = 0;
has_polyA = false;
bool success;
do {
success = seq.read(fin, option);
if (success) {
seqs.push_back(seq);
++M;
has_polyA = has_polyA || seq.getFullLen() < seq.getTotLen();
}
} while (success);
fin.close();
assert(M + 1 == (int)seqs.size());
if (verbose) { printf("Refs.loadRefs finished!\n"); }
}
void Refs::saveRefs(char* outF) {
std::ofstream fout;
fout.open(outF);
for (int i = 1; i <= M; i++) {
seqs[i].write(fout);
}
fout.close();
if (verbose) { printf("Refs.saveRefs finished!\n"); }
}
#endif