-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathplink_binary.h
276 lines (237 loc) · 9.11 KB
/
plink_binary.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
/*
* Copyright (c) 2011-2012 Genome Research Ltd. All rights reserved.
*
* This file is part of Gftools.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef GFTOOLS_PLINK_BIN
#define GFTOOLS_PLINK_BIN
#include <fcntl.h>
#include <sys/mman.h>
#include <cstddef>
#include <fstream>
#include <map>
#include <set>
#include <string>
#include <vector>
#include "snp.h"
#include "individual.h"
#include "exceptions.h"
/** Reader/writer interface for a Plink binary dataset.
*
* A dataset is addressed by name; the BIM (SNP) and FAM (individual) file
* names are calculated from that name, and genotype calls are read from or
* written to the BED data. Only declarations live here; the behaviour of
* each member is defined in the implementation file.
*
* NOTE(review): the class holds raw pointers and a file descriptor and
* declares a destructor but no copy operations - confirm whether copying an
* instance is safe before doing so.
*/
class plink_binary {
private:
/// Presumably true when the dataset was opened in write mode - confirm.
bool open_for_write;
/// Stream used for the BED file (presumably when it is not memory-mapped).
std::fstream *bed_file;
// State for the BED file.
/// Presumably true when the BED file is accessed through a memory map.
bool is_mem_mapped;
/// Presumably the base address of the mapped BED file - confirm.
char *fmap;
/// Presumably the length in bytes of the mapped BED file - confirm.
size_t flen;
/// Presumably the file descriptor backing the mapping - confirm.
int fd;
unsigned int snp_ptr; // index of the next SNP to be read
/// Presumably the number of BED bytes occupied by one SNP's calls - confirm.
unsigned int bytes_per_snp;
/// Reads the header of the BED file.
void read_bed_header();
/// Writes the header of the BED file.
void write_bed_header();
/// Writes genotype call codes for a SNP to the BED data.
void bed_write(gftools::snp snp, std::vector<int> genotypes);
/// Writes genotype call codes to the BED data.
void bed_write(std::vector<int> genotypes);
/// Presumably unpacks `len` bytes of BED data from `buffer` into genotype
/// call codes - confirm against the implementation.
void uncompress_calls(char *buffer, size_t len, std::vector<int> &genotypes);
/// Presumably packs genotype call codes into `buffer` in BED encoding -
/// confirm against the implementation (including the required buffer size).
void compress_calls(char *buffer, std::vector<int> genotypes);
/// Opens the named BED file for writing.
void open_bed_write(std::string filename);
/// Opens the named BED file for reading; `quell_mem_mapping` presumably
/// disables memory-mapping when true (see the public member of that name).
void open_bed_read(std::string filename, bool quell_mem_mapping);
/// Fetches the genotype call codes at position `pos` in the BED data.
/// NOTE(review): whether `pos` is a SNP index or a byte offset is not
/// visible from this header - confirm.
void get_snp(size_t pos, std::vector<int> &genotypes);
/// Presumably copies `len` bytes of BED data starting at `pos` into
/// `buffer` - confirm against the implementation.
void extract_bed(size_t pos, size_t len, char *buffer);
/// Shared initialisation for a named dataset; `mode` presumably has the
/// same meaning as in open(std::string, bool) (true meaning write).
void init(std::string dataset, bool mode);
/** Collates genotype call strings to determine the alleles involved.
*
* Splits each call string to determine the alleles involved. Checks the
* number of alleles involved.
* @param g_str A vector of genotype call strings.
* @return A vector of allele strings (documented upstream as the pair
* A and B).
*/
std::vector<std::string> collate_alleles(const std::vector<std::string> &g_str);
/// Reports whether the given input stream is empty.
/// NOTE(review): the exact emptiness test is defined in the implementation.
bool is_empty(std::ifstream &ifstream);
public:
/// The dataset name.
std::string dataset;
/// Mode of BED file (0 => individual major, 1 => snp major)
char bed_mode;
/// The character used to represent an allele where the genotype is unknown
char missing_genotype;
/// Prevent memory-mapping of the BED file, if true.
bool quell_mem_mapping;
/// The vector of SNPs.
std::vector<gftools::snp> snps;
/// The vector of individuals.
std::vector<gftools::individual> individuals;
/// An index of SNPs by name, mapping the name to an index in the BED data.
std::map<std::string, int> snp_index;
/** Constructor that creates and initializes named dataset.
* Implicitly opens the dataset in read mode.
*
* @param dataset The dataset name.
* @see open.
*/
plink_binary(std::string dataset);
/** Constructor that creates an unnamed, uninitialized instance.
*/
plink_binary();
/** The destructor.
*/
~plink_binary();
/** Initializes a named Plink dataset.
*
* @param dataset The dataset name.
* @param mode A read/write mode, true meaning write.
*/
void open(std::string dataset, bool mode);
/** Initializes a named Plink dataset in read mode.
*
* @param dataset The dataset name.
*/
void open(std::string dataset);
/** Closes an initialized Plink dataset.
*/
void close(void);
/** Populates a SNP vector with SNP data from a Plink BIM format file.
*
* The BIM file name is calculated from the current dataset name. This
* file is opened, parsed and then closed.
*
* @param snps A reference to a vector of SNPs, which receives the parsed
* records.
*/
void read_bim(std::vector<gftools::snp> &snps);
/** Writes SNP data to a Plink BIM format file.
*
* The BIM file name is calculated from the current dataset name. This
* file is opened, written and then closed.
*
* @param snps The vector of SNPs to write (passed by value).
*/
void write_bim(std::vector<gftools::snp> snps);
/** Makes a SNP from a Plink BIM format record.
*
* @param record A string holding one BIM record.
* @return A SNP.
*/
gftools::snp from_bim(std::string record);
/** Makes a Plink BIM record string from a SNP.
*
* @param snp A SNP.
* @return A BIM record.
*/
std::string to_bim(gftools::snp snp);
/** Populates a vector with individual data from a Plink FAM format file.
*
* The FAM file name is calculated from the current dataset name. This
* file is opened, parsed and then closed.
*
* @param ind A reference to a vector of individuals, which receives the
* parsed records.
*/
void read_fam(std::vector<gftools::individual> &ind);
/** Writes individual data to a Plink FAM format file.
*
* The FAM file name is calculated from the current dataset name. This
* file is opened, written and then closed.
*
* @param ind The vector of individuals to write (passed by value).
*/
void write_fam(std::vector<gftools::individual> ind);
/** Makes an individual from a Plink FAM record.
*
* @param record A string holding one FAM record.
* @return An individual.
*/
gftools::individual from_fam(std::string record);
/** Makes a Plink FAM record string from an individual.
*
* @param ind An individual.
* @return A FAM record.
*/
std::string to_fam(gftools::individual ind);
/** Decodes the next SNP and its genotype calls from BED data.
*
* @param snp A SNP reference that will be pointed to the next SNP.
* @param genotypes A vector of strings indicating the genotype calls
* for the SNP.
* @return true if there are further SNPs.
*/
bool next_snp(gftools::snp &snp, std::vector<std::string> &genotypes);
/** Looks up a SNP by name and updates a vector of genotype strings to the
* calls for that SNP.
*
* @param snp A SNP name.
* @param genotypes A vector of genotype call strings, updated with the
* calls for the SNP.
*/
void read_snp(std::string snp, std::vector<std::string> &genotypes);
/** Looks up a SNP by index in the BED data and updates a vector of genotype
* call integer codes for that SNP.
*
* @param snp A SNP index.
* @param genotypes A vector of genotype call codes, updated with the calls
* for the SNP.
*/
void read_snp(int snp, std::vector<int> &genotypes);
/** Writes the data of a SNP and its corresponding genotypes into the BED data.
*
* Also pushes the SNP onto the vector of SNPs as a side-effect, so the
* order of calls matters: e.g. do not call read_snp prior to this, or data
* may get written at the wrong file offset.
* This function writes at the current position in the BED data.
*
* @param snp A snp whose data will be written.
* @param genotypes A vector of genotype call integer codes.
*/
void write_snp(const gftools::snp snp, const std::vector<int> genotypes);
/** Writes the data of a SNP and its corresponding genotypes into the BED data.
*
* @see write_snp(gftools::snp snp, std::vector<int> genotypes)
*
* @param snp A snp whose data will be written.
* @param genotypes A vector of genotype call strings.
*/
void write_snp(gftools::snp snp, std::vector<std::string> genotypes);
/** Writes a vector of genotype call integer codes into the BED data.
*
* NOTE(review): presumably this writes the calls of one individual, for
* BED data in individual-major mode (see bed_mode) - confirm against the
* implementation.
*
* @param genotypes A vector of genotype call integer codes.
*/
void write_individual(std::vector<int> genotypes);
/** Translates integer representations of genotype calls for one SNP to their
* corresponding string representations.
*
* Essentially this is the inverse of genotypes_atoi, except that the SNP is
* not modified.
*
* @param snp The SNP whose genotype strings are to be generated.
* @param g_num A vector of genotype call integer codes, @see genotypes_atoi.
* @param g_str A vector of genotype call strings, updated according to
* the integer codes.
*/
void genotypes_itoa(gftools::snp snp, const std::vector<int> g_num, std::vector<std::string> &g_str);
/** Translates string representations of genotype calls for one SNP to their
* corresponding integer representations and updates the alleles of the SNP.
*
* The character '0' is reserved to mean no-call in the string
* representation, otherwise the character used to represent each allele is
* unimportant. However, if more than two alleles are called, an error will
* be raised. If either called allele is in disagreement with the expected
* alleles of the SNP, an error will be raised.
*
* The integer codes created for each genotype are:
*
* - AA: 1
* - AB: 2
* - BA: 2
* - BB: 3
*
* NOTE(review): the code produced for a no-call is not listed above;
* presumably 0 - confirm against the implementation.
*
* @param snp A SNP whose calls these are; its alleles are updated.
* @param g_str A vector of genotype call strings for the SNP, each allele
* being a single character.
* @param g_num A vector of integers, updated according to the genotype call
* strings.
*/
void genotypes_atoi(gftools::snp &snp, const std::vector<std::string> g_str, std::vector<int> &g_num);
};
#endif // GFTOOLS_PLINK_BIN