diff --git a/src/common/MethpipeFiles.cpp b/src/common/MethpipeFiles.cpp index 53a0a9b..843d787 100644 --- a/src/common/MethpipeFiles.cpp +++ b/src/common/MethpipeFiles.cpp @@ -24,6 +24,7 @@ #include #include #include + #include "GenomicRegion.hpp" #include "smithlab_utils.hpp" #include "smithlab_os.hpp" @@ -49,8 +50,7 @@ void methpipe::load_cpgs(const string &cpgs_file, vector &cpgs, vector > &meths, - vector &reads) -{ + vector &reads) { string chrom, prev_chrom; size_t pos, prev_pos = 0; string strand, seq; @@ -58,9 +58,9 @@ methpipe::load_cpgs(const string &cpgs_file, size_t coverage; std::ifstream in(cpgs_file.c_str()); - string line = skip_header(in); // added + string line = skip_header(in); // added std::istringstream iss(line); //added - + iss >> chrom >> pos >> strand >> seq >> meth >> coverage; prev_chrom = chrom; prev_pos = pos; @@ -72,24 +72,24 @@ methpipe::load_cpgs(const string &cpgs_file, meths.back().first = static_cast(round(meth * coverage)); meths.back().second = static_cast(coverage - meths.back().first); - + while (in >> chrom >> pos >> strand >> seq >> meth >> coverage) { //changed // sanity check - if (chrom.empty() || strand.empty() || seq.empty() || - meth < 0.0 || meth > 1.0) { + if (chrom.empty() || strand.empty() || seq.empty() || + meth < 0.0 || meth > 1.0) { std::ostringstream oss; oss << chrom << "\t" << pos << "\t" << strand << "\t" << seq << "\t" << meth << "\t" << coverage << "\n"; throw SMITHLABException("Invalid input line:" + oss.str()); } - + // order check if (prev_chrom > chrom || (prev_chrom == chrom && prev_pos > pos)) { throw SMITHLABException("CpGs not sorted in file \"" + cpgs_file + "\""); } prev_chrom = chrom; prev_pos = pos; - + // append site cpgs.push_back(SimpleGenomicRegion(chrom, pos, pos+1)); reads.push_back(coverage); @@ -103,8 +103,7 @@ void methpipe::load_cpgs(const string &cpgs_file, vector &cpgs, vector > &meths, - vector &reads) -{ + vector &reads) { string chrom, prev_chrom; size_t pos, prev_pos = 0; string strand, seq; @@ -114,12 +113,12 @@ methpipe::load_cpgs(const string &cpgs_file, std::ifstream in(cpgs_file.c_str()); string line = skip_header(in); //added std::istringstream iss(line); //added - + iss >> chrom >> pos >> strand >> seq >> meth >> coverage; prev_chrom = chrom; prev_pos = pos; - // append site + // append site cpgs.push_back(GenomicRegion(chrom, pos, pos+1, seq, 0, strand[0])); reads.push_back(coverage); meths.push_back(std::make_pair(0.0, 0.0)); @@ -135,14 +134,14 @@ methpipe::load_cpgs(const string &cpgs_file, << seq << "\t" << meth << "\t" << coverage << "\n"; throw SMITHLABException("Invalid input line:" + oss.str()); } - + // order check if (prev_chrom > chrom || (prev_chrom == chrom && prev_pos > pos)) { throw SMITHLABException("CpGs not sorted in file \"" + cpgs_file + "\""); } prev_chrom = chrom; prev_pos = pos; - + // append site cpgs.push_back(GenomicRegion(chrom, pos, pos+1, seq, 0, strand[0])); reads.push_back(coverage); @@ -154,21 +153,21 @@ methpipe::load_cpgs(const string &cpgs_file, void methpipe::load_cpgs_old(const string &cpgs_file, - vector &cpgs, - vector > &meths, - vector &reads) + vector &cpgs, + vector > &meths, + vector &reads) { std::ifstream in(cpgs_file.c_str()); - string line = skip_header(in); // added + string line = skip_header(in); // added std::istringstream iss(line); //added GenomicRegion r, prev_site; - + iss >> r; // append site const string site_name(r.get_name()); const size_t coverage = atoi( - site_name.substr(site_name.find_first_of(":") + 1).c_str()); + site_name.substr(site_name.find_first_of(":") + 1).c_str()); const double meth = r.get_score(); cpgs.push_back(SimpleGenomicRegion(r)); reads.push_back(coverage); @@ -178,7 +177,7 @@ methpipe::load_cpgs_old(const string &cpgs_file, prev_site = r; - + while (in >> r) { // order check if (r < prev_site) { @@ -187,7 +186,7 @@ methpipe::load_cpgs_old(const string &cpgs_file, // append site const string site_name(r.get_name()); const size_t coverage = atoi( - site_name.substr(site_name.find_first_of(":") + 1).c_str()); + site_name.substr(site_name.find_first_of(":") + 1).c_str()); const double meth = r.get_score(); cpgs.push_back(SimpleGenomicRegion(r)); reads.push_back(coverage); @@ -201,21 +200,20 @@ methpipe::load_cpgs_old(const string &cpgs_file, void methpipe::load_cpgs_old(const string &cpgs_file, - vector &cpgs, - vector > &meths, - vector &reads) -{ + vector &cpgs, + vector > &meths, + vector &reads) { std::ifstream in(cpgs_file.c_str()); - string line = skip_header(in); // added + string line = skip_header(in); // added std::istringstream iss(line); //added GenomicRegion r, prev_site; - + iss >> r; // append site const string site_name(r.get_name()); const size_t coverage = atoi( - site_name.substr(site_name.find_first_of(":") + 1).c_str()); + site_name.substr(site_name.find_first_of(":") + 1).c_str()); const double meth = r.get_score(); cpgs.push_back(r); reads.push_back(coverage); @@ -225,7 +223,7 @@ methpipe::load_cpgs_old(const string &cpgs_file, prev_site = r; - + while (in >> r) { // order check if (r < prev_site) { @@ -234,7 +232,7 @@ methpipe::load_cpgs_old(const string &cpgs_file, // append site const string site_name(r.get_name()); const size_t coverage = atoi( - site_name.substr(site_name.find_first_of(":") + 1).c_str()); + site_name.substr(site_name.find_first_of(":") + 1).c_str()); const double meth = r.get_score(); cpgs.push_back(r); reads.push_back(coverage); @@ -252,7 +250,7 @@ methpipe::is_methpipe_file_single(const string &file) { std::ifstream in(file.c_str()); if (!in) throw SMITHLABException("could not open file: " + file); - + string line; if (!std::getline(in, line)) throw SMITHLABException("could not read file: " + file); @@ -263,82 +261,79 @@ methpipe::is_methpipe_file_single(const string &file) { size_t pos = 0, coverage = 0; double meth = 0.0; iss >> chrom >> pos >> strand >> name >> meth >> coverage; - + if (strand != "+" && strand != "-") return false; - + if (meth < 0.0 || meth > 1.0) return false; - + if (name.find_first_not_of("ACGTacgtpHXx") != string::npos) return false; - + return true; } static void -move_to_start_of_line(std::istream &in) -{ - char next; - while (in.good() && in.get(next) && next != '\n') +move_to_start_of_line(std::istream &in) { + char next; + while (in.good() && in.get(next) && next != '\n') { - in.unget(); - in.unget(); + in.unget(); + in.unget(); } - if (in.bad()) - // hope this only happens when hitting the start of the file - in.clear(); + if (in.bad()) + // hope this only happens when hitting the start of the file + in.clear(); } void methpipe::seek_site(std::istream &in, const std::string &chr, - const size_t idx) -{ - in.seekg(0, ios_base::beg); - const size_t begin_pos = in.tellg(); - in.seekg(0, ios_base::end); - size_t step_size = (static_cast(in.tellg()) - begin_pos)/2; - - in.seekg(0, ios_base::beg); - string low_chr; - size_t low_idx = 0; - in >> low_chr >> low_idx; - - // MAGIC: need the -2 here to get past the EOF and possibly a '\n' - in.seekg(-2, ios_base::end); - move_to_start_of_line(in); - string high_chr; - size_t high_idx; - in >> high_chr >> high_idx; - - size_t pos = step_size; + const size_t idx) { + in.seekg(0, ios_base::beg); + const size_t begin_pos = in.tellg(); + in.seekg(0, ios_base::end); + size_t step_size = (static_cast(in.tellg()) - begin_pos)/2; + + in.seekg(0, ios_base::beg); + string low_chr; + size_t low_idx = 0; + in >> low_chr >> low_idx; + + // MAGIC: need the -2 here to get past the EOF and possibly a '\n' + in.seekg(-2, ios_base::end); + move_to_start_of_line(in); + string high_chr; + size_t high_idx; + in >> high_chr >> high_idx; + + size_t pos = step_size; + in.seekg(pos, ios_base::beg); + move_to_start_of_line(in); + + while (step_size > 0) { + string mid_chr; + size_t mid_idx = 0; + in >> mid_chr >> mid_idx; + step_size /= 2; + if (chr < mid_chr || (chr == mid_chr && idx <= mid_idx)) { + std::swap(mid_chr, high_chr); + std::swap(mid_idx, high_idx); + pos -= step_size; + } + else { + std::swap(mid_chr, low_chr); + std::swap(mid_idx, low_idx); + pos += step_size; + } in.seekg(pos, ios_base::beg); move_to_start_of_line(in); - - while (step_size > 0) - { - string mid_chr; - size_t mid_idx = 0; - in >> mid_chr >> mid_idx; - step_size /= 2; - if (chr < mid_chr || (chr == mid_chr && idx <= mid_idx)) { - std::swap(mid_chr, high_chr); - std::swap(mid_idx, high_idx); - pos -= step_size; - } - else { - std::swap(mid_chr, low_chr); - std::swap(mid_idx, low_idx); - pos += step_size; - } - in.seekg(pos, ios_base::beg); - move_to_start_of_line(in); - } + } } std::istream& methpipe::read_site(std::istream &in, string &chrom, size_t &pos, string &strand, string &seq, - double &meth, size_t &coverage) { + double &meth, size_t &coverage) { string line = skip_header(in); std::istringstream iss(line); iss >> chrom >> pos >> strand >> seq >> meth >> coverage; @@ -357,8 +352,8 @@ methpipe::write_site(std::ostream &out, std::istream& methpipe::read_site_old(std::istream &in, string &chrom, size_t &pos, - string &strand, std::string &seq, - double &meth, size_t &coverage) + string &strand, std::string &seq, + double &meth, size_t &coverage) { string line = skip_header(in); //added std::istringstream iss(line);//added @@ -369,15 +364,14 @@ methpipe::read_site_old(std::istream &in, string &chrom, size_t &pos, coverage = atoi(name.substr(name.find(":") + 1).c_str()); } else if (!line.empty()) - throw SMITHLABException("Invalid input line:" + line); + throw SMITHLABException("Invalid input line:" + line); return in; } - -bool + +bool methpipe::write_site_old(std::ostream &out, const string &chrom, const size_t &pos, - const string &strand, const string &seq, - const double &meth, const size_t &coverage) -{ + const string &strand, const string &seq, + const double &meth, const size_t &coverage) { out << chrom << "\t" << pos << "\t" << pos + 1 << "\t" << (seq + ":" + smithlab::toa(coverage)) << "\t" << (coverage == 0 ? 0.0 : meth) << "\t" << strand << std::endl; @@ -388,25 +382,24 @@ methpipe::write_site_old(std::ostream &out, const string &chrom, const size_t &p // IO functions for methdiff output bool methpipe::write_methdiff_site(std::ostream &out, const std::string &chrom, - const size_t pos, const std::string &strand, - const std::string &seq, const double diffscore, - const size_t meth_a, const size_t unmeth_a, - const size_t meth_b, const size_t unmeth_b) -{ - out << chrom << "\t" << pos << "\t" << strand << "\t" << seq << "\t" - << diffscore << "\t" << meth_a << "\t" << unmeth_a << "\t" - << meth_b << "\t" << unmeth_b << std::endl; - return out; + const size_t pos, const std::string &strand, + const std::string &seq, const double diffscore, + const size_t meth_a, const size_t unmeth_a, + const size_t meth_b, const size_t unmeth_b) { + out << chrom << "\t" << pos << "\t" << strand << "\t" << seq << "\t" + << diffscore << "\t" << meth_a << "\t" << unmeth_a << "\t" + << meth_b << "\t" << unmeth_b << std::endl; + return out; } bool methpipe::read_methdiff_site(std::istream &in, std::string &chrom, - size_t &pos, std::string &strand, - std::string &seq, double &diffscore, - size_t &meth_a, size_t &unmeth_a, - size_t &meth_b, size_t &unmeth_b) -{ - in >> chrom >> pos >> strand >> seq >> diffscore >> meth_a >> unmeth_a >> meth_b >> unmeth_b; - return in; + size_t &pos, std::string &strand, + std::string &seq, double &diffscore, + size_t &meth_a, size_t &unmeth_a, + size_t &meth_b, size_t &unmeth_b) { + in >> chrom >> pos >> strand >> seq >> + diffscore >> meth_a >> unmeth_a >> meth_b >> unmeth_b; + return in; }