forked from DaehwanKimLab/centrifuge
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcentrifuge_report.cpp
186 lines (153 loc) · 5.66 KB
/
centrifuge_report.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
/*
* centrifuge-build.cpp
*
* Created on: Apr 8, 2015
* Author: fbreitwieser
*/
#include<iostream>
#include<fstream>
#include<sstream>
#include<map>
#include<vector>
#include "assert_helpers.h"
#include "sstring.h"
#include "ds.h" // EList
#include "bt2_idx.h" // Ebwt
#include "bt2_io.h"
#include "util.h"
using namespace std;
typedef TIndexOffU index_t;
static bool startVerbose = true; // be talkative at startup
int gVerbose = 1; // be talkative always
static const char *argv0 = NULL;
static string adjIdxBase;
static bool useShmem = false; // use shared memory to hold the index
static bool useMm = false; // use memory-mapped files to hold the index
static bool mmSweep = false; // sweep through memory-mapped files immediately after mapping
static int offRate = -1; // keep default offRate
static bool noRefNames = false; // true -> print reference indexes; not names
static int sanityCheck = 0; // enable expensive sanity checks
/**
* Print a summary usage message to the provided output stream.
*/
static void printUsage(ostream& out) {
out << "Centrifuge version " << string(CENTRIFUGE_VERSION).c_str() << " by Daehwan Kim ([email protected], www.ccb.jhu.edu/people/infphilo)" << endl;
string tool_name = "centrifuge-class";
out << "Usage: " << endl
<< " " << tool_name.c_str() << " <bt2-idx> <centrifuge-out>" << endl
<< endl
<< " <bt2-idx> Index filename prefix (minus trailing .X." << gEbwt_ext << ")." << endl
<< " <centrifuge-out> Centrifuge result file." << endl;
}
template <typename T>
class Pair2ndComparator{
public:
bool operator()(const pair<T,T> &left, const pair<T,T> &right){
return left.second < right.second;
}
};
template<typename TStr>
static void driver(
const char * type,
const string& bt2indexBase,
const string& cf_out)
{
if(gVerbose || startVerbose) {
cerr << "Entered driver(): "; logTime(cerr, true);
}
//initializeCntLut(); // FB: test commenting
// Vector of the reference sequences; used for sanity-checking
EList<SString<char> > names, os;
EList<size_t> nameLens, seqLens;
// Initialize Ebwt object and read in header
if(gVerbose || startVerbose) {
cerr << "About to initialize fw Ebwt: "; logTime(cerr, true);
}
adjIdxBase = adjustEbwtBase(argv0, bt2indexBase, gVerbose);
Ebwt<index_t> ebwt(
adjIdxBase,
0, // index is colorspace
-1, // fw index
true, // index is for the forward direction
/* overriding: */ offRate,
0, // amount to add to index offrate or <= 0 to do nothing
useMm, // whether to use memory-mapped files
useShmem, // whether to use shared memory
mmSweep, // sweep memory-mapped files
!noRefNames, // load names?
true, // load SA sample?
true, // load ftab?
true, // load rstarts?
gVerbose, // whether to be talkative
startVerbose, // talkative during initialization
false /*passMemExc*/,
sanityCheck);
//Ebwt<index_t>* ebwtBw = NULL;
EList<size_t> reflens;
EList<string> refnames;
readEbwtRefnames<index_t>(adjIdxBase, refnames);
map<uint32_t,pair<string,uint64_t> > speciesID_to_name_len;
for(size_t i = 0; i < ebwt.nPat(); i++) {
// cerr << "Push back to reflens: "<< refnames[i] << " is so long: " << ebwt.plen()[i] << endl;
reflens.push_back(ebwt.plen()[i]);
// extract numeric id from refName
const string& refName = refnames[i];
uint64_t id = extractIDFromRefName(refName);
uint32_t speciesID = (uint32_t)(id >> 32);
// extract name from refName
const string& name_part = refName.substr(refName.find_first_of(' '));
//uint32_t genusID = (uint32_t)(id & 0xffffffff);
speciesID_to_name_len[speciesID] = pair<string,uint64_t>(name_part,ebwt.plen()[i]);
}
// EList<string> refnames;
// readEbwtRefnames<index_t>(adjIdxBase, refnames);
// Read Centrifuge output file
ifstream infile(cf_out.c_str());
string line;
map<uint32_t,uint32_t> species_to_score;
while (getline(infile,line)) {
string rd_name;
uint32_t genusID;
uint32_t speciesID;
uint32_t score;
uint32_t secbest_score;
istringstream iss(line);
iss >> rd_name >> genusID >> speciesID >> score >> secbest_score;
// cerr << rd_name << " -> " << genusID << " -> " << speciesID << " -> " << score << " -> " << secbest_score << "\n";
species_to_score[speciesID] += score;
}
// Sort the species by their score
vector<pair<uint32_t,uint32_t> > species_to_score_v(species_to_score.begin(), species_to_score.end());
sort(species_to_score_v.begin(),species_to_score_v.end(),Pair2ndComparator<uint32_t>());
cout << "Name\tTaxonID\tLength\tSummed Score\tNormalized Score\n";
// Output the summed species scores
for (vector<pair<uint32_t,uint32_t> >::iterator species_score = species_to_score_v.begin();
species_score != species_to_score_v.end();
++species_score) {
uint32_t speciesID = species_score->first;
pair<string,uint64_t> name_len = speciesID_to_name_len[speciesID];
uint64_t slength = name_len.second;
uint64_t sumscore = species_score->second;
cout << name_len.first << "\t" <<
speciesID << "\t" <<
slength << "\t" <<
sumscore << "\t" <<
(float)sumscore/slength << "\n";
}
}
//int centrifuge_report(int argc, const char **argv) {
int main(int argc, const char **argv) {
if (argc < 3) {
cerr << "Number of arguments is " << argc << endl;
printUsage(cerr);
exit(1);
}
argv0 = argv[0];
const string bt2index = argv[1];
const string cf_out = argv[2];
//static string outfile; // write SAM output to this file
cout << "Input bt2 file: \"" << bt2index.c_str() << "\"" << endl;
cout << "Centrifuge results file: \"" << cf_out.c_str() << "\"" << endl;
driver<SString<char> >("DNA", bt2index, cf_out);
return 0;
}