-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathnuc_sampler.cpp
336 lines (307 loc) · 12.4 KB
/
nuc_sampler.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
// nuc_sampler - Estimates the composition of (poly-)nucleotides in a
// sequence sample
// @author Luis M. Rodriguez-R <lmrodriguezr at gmail dot com>
// @license artistic 2.0
// @version 2.0
#define _MULTI_THREADED
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <vector>
#include "enveomics/universal.h"
#include "enveomics/sequence.h"
//#define DEBUG(a) (cerr << "(LINE " << a << ")" << endl)
#define DEBUG(a) (a)
#define LARGEST_LINE 2048
#define LARGEST_PATH 2048
#define OMAG_STEP 1024
#define VERSION "1.3.0"
using namespace std;
typedef struct {
char *kmer; // <- The polynucleotide
unsigned int label; // <- A numeric representation of the kmer
// Total count is calculated as: base*omag + hang
unsigned int base; // <- The base in the above expression
unsigned int omag; // <- The omag in the above expression
unsigned int hang; // <- The hang in the above expression
} count_t;
int V=0;
char nuc_char[] = {
'N', 'A', 'C', 'G', 'T', 'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', '.'
};
char hex_char[] = {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};
void help(const char *msg) {
if (msg != NULL && strlen(msg) != 0) cerr << endl << msg << endl << endl;
cerr
<< "DESCRIPTION" << endl
<<" Calculates or approximates the composition of nucleotides or" << endl
<<" polynucleotides in large datasets" << endl
<< endl
<<"USAGE" << endl
<<" nuc_sampler -s sequences.fa [options]" << endl
<<" nuc_sampler -h" << endl
<<" nuc_sampler -V" << endl
<< endl
<<"MANDATORY ARGUMENTS" << endl
<<" -s <str> : Path to the (input) file containing the sequences," << endl
<<" which can be gzipped with .gz extension" << endl
<< endl
<<"ADDITIONAL OPTIONS" << endl
<<" -o <str> : Path to the output file where results will be" << endl
<<" saved. By default the results are sent to stdout." << endl
<<" This is the same behavior as using a dash (-)." << endl
<<" If an empty string is provided, does not produce" << endl
<<" any output" << endl
<<" -e : Produce extended output" << endl
<<" -f <str> : The format of the sequences. Can be 'fasta' or" << endl
<<" 'fastq'. By default: 'fasta'" << endl
<<" -k <int> : Size of the polynucleotides (words) to count." << endl
<<" By default: 1" << endl
<<" -r <int> : Random generator seed. By default current time" << endl
<<" -v <int> : Verbose, for debugging purposes. By default 0" << endl
<<" -x <num> : Probability of taking a sequence into account," << endl
<<" regardless of the sequence length. Lower values" << endl
<<" reduce accuracy but increase speed. Any value" << endl
<<" lower than 1 produces and approximation of the" << endl
<<" relative composition, since reported counts only" << endl
<<" include observed polynucleotides. By default 1" << endl
<<" -h : Display this message and exit" << endl
<<" -V : Show version information and exit" << endl
<<" -z : Include zero-count polynucleotides in the output" << endl
<< endl
<<"INPUT" << endl
<<" Sequences must be in FastA or FastQ format" << endl
<< endl
<<"OUTPUT" << endl
<<" - The output is a tab-separated table containing the columns:" << endl
<<" 1. kmer: the (poly-)nucleotide string" << endl
<<" 2. count: the total count, which can be an approximation" << endl
<<" in scientific notation for large numbers" << endl
<< endl
<<" - The extended output is intended to provide higher precision" << endl
<<" for large counts, and contains the following additional" << endl
<<" columns (using -e):" << endl
<<" 3. base: count expressed as times of order of magnitude" << endl
<<" 4. order of magnitude: units of base" << endl
<<" 5. extra counts: excess counts from base" << endl
<< endl
<<" The actual count can be calculated more accurately as the" << endl
<<" product of base (fourth column) and order of magnitude" << endl
<<" (fourth column), plus the extra counts (fifth column)." << endl
<< endl
<<" In this compilation, the value of order of magnitude is a" << endl
<<" power of " << OMAG_STEP << endl
<< endl;
exit(1);
}
void kmer2label(unsigned int &label, char *kmer, unsigned int length) {
// Vars
char hex[length+1]; // char representation of a hexadecimal integer
// label
for (unsigned int i = 0; i < length; i++) {
hex[i] = 'f'; // <- Any other weird thing
for (int j = 0; j < 16; j++)
if (kmer[i] == nuc_char[j]) hex[i] = hex_char[j];
}
hex[length] = (char)NULL;
sscanf(hex, "%x", &label); // hex ---> label
}
void label2kmer(char *&kmer, unsigned int label, unsigned int length) {
// Vars
char *hex; // char representation of a hexadecimal integer
int lead_zeroes;
// label
hex = new char[length + 1];
snprintf(hex, length + 1, "%x", label); // hex <--- label
lead_zeroes = length - strlen(hex);
for (unsigned int i = 0; i < lead_zeroes; i++) kmer[i] = nuc_char[0];
for (unsigned int i = lead_zeroes; i < length; i++) {
for (int j = 0; j < 16; j++)
if(hex[i-lead_zeroes] == hex_char[j]) kmer[i] = nuc_char[j];
}
kmer[length] = (char)NULL;
}
void count_plus(count_t &count, unsigned int howmany){
// ++hang
count.hang += howmany;
// ++base
if (count.hang >= count.omag) {
count.base += (unsigned int) count.hang / count.omag;
count.hang = count.hang % count.omag;
}
// ++omag
if (count.base >= UINT_MAX / OMAG_STEP) {
count.omag *= OMAG_STEP;
count.hang += count.base % count.omag;
count.base /= OMAG_STEP;
}
}
unsigned int count_polynucleotides(
count_t *&counts, char *seqFile, unsigned int k, float heur) {
// Vars
unsigned int labels_no, linelen, label;
ifstream fileh;
char *line, *kmer;
float rn;
bool use = true;
// Kmers and counts initialization
labels_no = (unsigned int)pow(16, k);
if (V >= 4) fprintf(
stderr, "Creating unique integer ids for %u kmers\n", labels_no
);
counts = (count_t *)malloc(labels_no * (sizeof *counts));
if (!counts) error(
"Impossible to allocate memory for that many different kmers", labels_no
);
for (unsigned int a = 0; a < labels_no; a++) {
counts[a].kmer = (char *)malloc(k + 1);
if (!counts[a].kmer)
error("Insufficient memory to represent the next kmer", a);
label2kmer(counts[a].kmer, a, k);
counts[a].label = a;
counts[a].base = 0;
counts[a].hang = 0;
counts[a].omag = 1;
if (V >= 5)
fprintf(stderr, "Kmer %s -> %u\n", counts[a].kmer, counts[a].label);
}
// Allocate char* memory
if (V >= 5)
fprintf(stderr, "Allocating %u bits\n", CHAR_BIT*(LARGEST_LINE + 1));
line = (char *)malloc(LARGEST_LINE + 1);
if (!line)
error("Impossible to allocate one temporal line in memory", LARGEST_LINE);
if (V >= 5)
fprintf(stderr, "Allocating %u bits\n", CHAR_BIT * (k + 1));
kmer = (char *)malloc(k + 1);
if (!kmer) error("Impossible to allocate one temporal kmer in memory", k);
kmer[k] = (char)NULL;
// Read the file
if (V >= 5) cerr << "Opening file: " << seqFile << endl;
fileh.open(seqFile, ios::in);
if (!fileh.is_open()) error("Error reading file", seqFile);
while (!fileh.eof()) {
fileh.getline(line, LARGEST_LINE);
if (line[0] == '>') {
if (heur != 1.0) {
rn = static_cast <float> (rand()) / static_cast <float> (RAND_MAX);
use = rn < heur;
}
} else {
if (!use) goto next_line;
linelen = strlen(line);
if (linelen < k + 1) goto next_line;
for (int i = 0; i < linelen - k + 1; i++) {
memmove(kmer, line+i, k);
kmer2label(label, kmer, k);
count_plus(counts[label], 1);
}
}
next_line:;
}
fileh.close();
return labels_no;
}
void report(
count_t *&counts, unsigned int length, char *outfile,
bool extended, bool zeroes) {
ofstream outfs;
char *sep = (char *)"\t", *text1, *text2;
bool std_out = false;
if (strlen(outfile) <= 0) return;
else if (strcmp(outfile, "-")==0) std_out = true;
else {
outfs.open(outfile, ios::out);
if (!outfs.is_open()) error("I can not write in the output file", outfile);
}
for (unsigned int i = 0; i < length; i++) {
if (!zeroes && counts[i].base == 0 && counts[i].hang == 0) continue;
text1 = new char[100];
snprintf(text1, 100, "%s%s%g",
counts[i].kmer, sep,
(double)counts[i].base*counts[i].omag+counts[i].hang);
if (std_out) cout << text1; else outfs << text1;
if (extended) {
text2 = new char[100];
snprintf(text2, 100, "%s%u%s%u%s%u", sep,
counts[i].base, sep,
counts[i].omag, sep,
counts[i].hang);
if (std_out) cout << text2; else outfs << text2;
}
if (std_out) cout << endl; else outfs << endl;
}
if (!std_out) outfs.close();
}
int main(int argc, char *argv[]) {
cout << "nuc_sampler v" << VERSION << endl;
if (argc <= 1) help("");
// Vars
char *inputfile = new char[LARGEST_PATH], *file = new char[LARGEST_PATH],
*format = (char *)"fasta", *outfile = (char *)"-", *namFile, *seqFile;
double heur = 1.0;
int rseed = time(NULL), largest_seq;
unsigned int k = 1, labels_no, N;
count_t *counts;
bool extended = false, remove_input = false, zeroes = false;
// GetOpt
int optchr;
while ((optchr = getopt (argc, argv, "ef:ho:r:s:v:Vk:x:z")) != EOF)
switch(optchr) {
case 'e': extended = true; break;
case 'f': format = optarg; break;
case 'h': help(""); break;
case 'k': k = atoi(optarg); break;
case 'o': outfile = optarg; break;
case 'r': rseed=atoi(optarg); break;
case 's': inputfile = optarg; break;
case 'v': V = atoi(optarg); break;
case 'V': return 0;
case 'x': heur = atof(optarg); break;
case 'z': zeroes = true; break;
}
// Initialize
if (strlen(inputfile) == 0) help("");
if (strcmp(format, "fasta") != 0 && strcmp(format, "fastq") != 0)
help("Unsupported value for -f option");
if (k<=0)
help("Bad argument for -k option: it must be a non-zero positive integer");
if (heur <= 0.0 || heur > 1.0)
help("Bad argument for -x option: it must be in the range (0, 1]");
if (outfile && (strlen(outfile) > 0) && (strcmp(outfile, "-") != 0))
remove(outfile);
srand(rseed);
// Check if gzipped
if (has_gz_ext(inputfile)) {
remove_input = true;
snprintf(file, LARGEST_PATH, "%s.enve-tmp.%d", inputfile, getpid());
gunz_file(inputfile, file);
} else {
snprintf(file, LARGEST_PATH, "%s", inputfile);
}
// Parse file
if (V) cerr << "Counting sequences" << endl;
N = build_index(file, format, namFile, seqFile, largest_seq);
if (largest_seq < 1)
error("Empty sequences or internal error. Largest sequence: ", largest_seq);
if (V >= 2)
cerr << "The file " << seqFile
<< " was just created with sequence-only data" << endl;
if (V >= 4) cerr << "Longest sequence is: " << largest_seq << endl;
if (N == 0)
error("The file you provided has no sequences. Delete the file", seqFile);
if (V) cerr << "Reading file with " << N << " sequences" << endl;
// Run counts
labels_no = count_polynucleotides(counts, seqFile, k, heur);
report(counts, labels_no, outfile, extended, zeroes);
// Cleanup
if (remove_input) remove(file);
remove(namFile);
remove(seqFile);
return 0;
}