Skip to content

Commit

Permalink
Merge pull request #247 from smithlabcode/minor-fixes
Browse files Browse the repository at this point in the history
Fixing some issues with option parsing on macos
  • Loading branch information
andrewdavidsmith authored Oct 20, 2024
2 parents deaff4a + 9596b02 commit f2fe044
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 35 deletions.
31 changes: 20 additions & 11 deletions src/analysis/cpgbins.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,14 @@
#include "MSite.hpp"
#include "OptionParser.hpp"
#include "bsutils.hpp"
#include "xcounts_utils.hpp"
#include "smithlab_utils.hpp"
#include "xcounts_utils.hpp"

#include <bamxx.hpp>

#include <algorithm>
#include <charconv>
#include <cstdint>
#include <filesystem>
#include <fstream>
#include <iostream>
Expand All @@ -45,6 +46,8 @@ using std::ostream;
using std::runtime_error;
using std::size;
using std::string;
using std::uint32_t;
using std::uint64_t;
using std::unordered_map;
using std::vector;

Expand Down Expand Up @@ -75,13 +78,13 @@ format_levels_counter(const LevelsCounter &lc) {
return oss.str();
}


static unordered_map<string, uint64_t>
get_chrom_sizes(const string &chrom_sizes_file) {
unordered_map<string, uint64_t> chrom_sizes;

ifstream in(chrom_sizes_file);
if (!in) throw runtime_error("failed to open file: " + chrom_sizes_file);
if (!in)
throw runtime_error("failed to open file: " + chrom_sizes_file);

string line;
while (getline(in, line)) {
Expand All @@ -91,7 +94,8 @@ get_chrom_sizes(const string &chrom_sizes_file) {
if (!(iss >> chrom_name >> chrom_size))
throw runtime_error("bad line in " + chrom_sizes_file + ":\n" + line);
string dummy;
if (iss >> dummy) throw runtime_error("too many columns: " + line);
if (iss >> dummy)
throw runtime_error("too many columns: " + line);

if (chrom_sizes.find(chrom_name) != cend(chrom_sizes))
throw runtime_error("repeated entry " + chrom_name + " in " +
Expand All @@ -105,7 +109,8 @@ get_chrom_sizes(const string &chrom_sizes_file) {
static vector<string>
get_chrom_names(const string &chrom_sizes_file) {
ifstream in(chrom_sizes_file);
if (!in) throw runtime_error("failed to open file: " + chrom_sizes_file);
if (!in)
throw runtime_error("failed to open file: " + chrom_sizes_file);

vector<string> chrom_names;

Expand All @@ -121,7 +126,6 @@ get_chrom_names(const string &chrom_sizes_file) {
return chrom_names;
}


static void
update(LevelsCounter &lc, const xcounts_entry &xce) {
const uint64_t n_reads = xce.n_meth + xce.n_unmeth;
Expand Down Expand Up @@ -149,7 +153,8 @@ process_chrom(const bool report_more_info, const char level_code,

uint64_t j = 0;
for (auto i = 0ul; i < chrom_size; i += bin_size) {
while (j < size(sites) && sites[j].pos < i) ++j;
while (j < size(sites) && sites[j].pos < i)
++j;

LevelsCounter lc;
while (j < size(sites) && sites[j].pos < i + bin_size)
Expand All @@ -166,7 +171,8 @@ process_chrom(const bool report_more_info, const char level_code,
: (level_code == 'u' ? lc.sites_covered
: lc.total_called()))));
out << r;
if (report_more_info) out << '\t' << format_levels_counter(lc);
if (report_more_info)
out << '\t' << format_levels_counter(lc);
out << '\n';
}
}
Expand All @@ -182,7 +188,8 @@ process_chrom(const bool report_more_info, const string &chrom_name,
r.set_start(i);
r.set_end(std::min(i + bin_size, chrom_size));
out << r;
if (report_more_info) out << '\t' << lc_formatted;
if (report_more_info)
out << '\t' << lc_formatted;
out << '\n';
}
}
Expand Down Expand Up @@ -210,7 +217,8 @@ Columns (beyond the first 6) in the BED format output:
bool verbose = false;
bool report_more_info = false;
uint32_t n_threads = 1;
uint64_t bin_size = 1000;
// uint64_t bin_size = 1000;
size_t bin_size = 1000; // ADS: for macOS gcc-14.2.0
string level_code = "w";
string outfile;

Expand Down Expand Up @@ -266,7 +274,8 @@ Columns (beyond the first 6) in the BED format output:
std::ofstream of;
if (!outfile.empty()) {
of.open(outfile);
if (!of) throw runtime_error("failed to open outfile: " + outfile);
if (!of)
throw runtime_error("failed to open outfile: " + outfile);
}
std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf());

Expand Down
66 changes: 42 additions & 24 deletions src/utils/clean-hairpins.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@

#include <algorithm>
#include <array>
#include <cstddef> // std::size_t
#include <cstdint> // std::uint32_t and std::uint64_t
#include <fstream>
#include <iomanip>
#include <iostream>
Expand All @@ -45,7 +47,10 @@ using std::ofstream;
using std::ostream;
using std::ostringstream;
using std::runtime_error;
using std::size_t;
using std::string;
using std::uint32_t;
using std::uint64_t;
using std::vector;

using bamxx::bgzf_file;
Expand Down Expand Up @@ -93,22 +98,28 @@ operator>>(bgzf_file &s, FASTQRecord &r) {

err_code ec = err_code::none;

if (!getline(s, r.name)) return s;
if (!getline(s, r.name))
return s;

if (r.name.empty() || r.name[0] != '@') ec = err_code::bad_name;
if (r.name.empty() || r.name[0] != '@')
ec = err_code::bad_name;

const auto nm_end = r.name.find_first_of(" \t");
const auto nm_sz = (nm_end == string::npos ? r.name.size() : nm_end) - 1;
r.name.erase(copy_n(cbegin(r.name) + 1, nm_sz, begin(r.name)), cend(r.name));

if (!getline(s, r.seq)) ec = err_code::bad_seq;
if (!getline(s, r.seq))
ec = err_code::bad_seq;

string tmp;
if (!getline(s, tmp)) ec = err_code::bad_plus;
if (!getline(s, tmp))
ec = err_code::bad_plus;

if (!getline(s, r.qual)) ec = err_code::bad_qual;
if (!getline(s, r.qual))
ec = err_code::bad_qual;

if (ec != err_code::none) throw error_msg[ec];
if (ec != err_code::none)
throw error_msg[ec];

return s;
}
Expand Down Expand Up @@ -171,15 +182,13 @@ struct hp_summary {
// sum_percent_match_bad over the total hairpin reads.
double mean_percent_match_hairpin{};

auto
assign_values() -> void {
auto assign_values() -> void {
mean_percent_match_non_hairpin =
sum_percent_match_good / (n_reads - n_hairpin_reads);
mean_percent_match_hairpin = sum_percent_match_bad / n_hairpin_reads;
}

auto
tostring() const -> string {
auto tostring() const -> string {
ostringstream oss;
oss << "total_reads_pairs: " << n_reads << '\n'
<< "good_reads_pairs: " << n_good_reads << '\n'
Expand All @@ -197,7 +206,8 @@ struct hp_summary {
static void
write_histogram(const string &hist_outfile, vector<double> hist) {
ofstream hist_out(hist_outfile);
if (!hist_out) throw runtime_error("failed to open file: " + hist_outfile);
if (!hist_out)
throw runtime_error("failed to open file: " + hist_outfile);
const auto total = accumulate(cbegin(hist), cend(hist), 0.0);
transform(cbegin(hist), cend(hist), begin(hist),
[&total](const double t) { return t / total; });
Expand All @@ -213,7 +223,8 @@ static void
write_statistics(const string &filename, hp_summary hps) {
hps.assign_values();
ofstream out(filename);
if (!out) throw runtime_error("failed to open file: " + filename);
if (!out)
throw runtime_error("failed to open file: " + filename);
out << hps.tostring();
}

Expand All @@ -226,17 +237,18 @@ fraction_good_bases(const FASTQRecord &a, const FASTQRecord &b) {

struct clean_hairpin {
double cutoff{0.95};
uint64_t n_reads_to_check{std::numeric_limits<size_t>::max()};
// ADS: this was uint64_t but g++-14.2.0 on macOS had a problem
size_t n_reads_to_check{std::numeric_limits<size_t>::max()};
double min_good_base_percent{0.5};
uint32_t min_read_length{32};
uint32_t n_hist_bins{20};
bool invert_output{false};

hp_summary
analyze_reads(const string &outfile1, const string &outfile2, bgzf_file &in1,
bgzf_file &in2, vector<double> &hist) const;
hp_summary
analyze_reads(bgzf_file &in1, bgzf_file &in2, vector<double> &hist) const;
hp_summary analyze_reads(const string &outfile1, const string &outfile2,
bgzf_file &in1, bgzf_file &in2,
vector<double> &hist) const;
hp_summary analyze_reads(bgzf_file &in1, bgzf_file &in2,
vector<double> &hist) const;
};

hp_summary
Expand All @@ -246,9 +258,11 @@ clean_hairpin::analyze_reads(const string &outfile1, const string &outfile2,

// output files for read1 and read2 with hairpins removed
bgzf_file out1(outfile1, "w");
if (!out1) throw runtime_error("cannot open output file: " + outfile1);
if (!out1)
throw runtime_error("cannot open output file: " + outfile1);
bgzf_file out2(outfile2, "w");
if (!out2) throw runtime_error("cannot open output file: " + outfile2);
if (!out2)
throw runtime_error("cannot open output file: " + outfile2);

hp_summary hps{cutoff};
FASTQRecord r1, r2;
Expand Down Expand Up @@ -384,17 +398,21 @@ main_clean_hairpins(int argc, const char **argv) {

// Input: paired-end reads with end1 and end2
bgzf_file in1(reads_file1, "r");
if (!in1) throw runtime_error("cannot open input file: " + reads_file1);
if (!in1)
throw runtime_error("cannot open input file: " + reads_file1);
bgzf_file in2(reads_file2, "r");
if (!in2) throw runtime_error("cannot open input file: " + reads_file2);
if (!in2)
throw runtime_error("cannot open input file: " + reads_file2);

const hp_summary hps =
write_reads ? ch.analyze_reads(outfile1, outfile2, in1, in2, hist)
: ch.analyze_reads(in1, in2, hist);

if (!stat_outfile.empty()) write_statistics(stat_outfile, hps);
if (!stat_outfile.empty())
write_statistics(stat_outfile, hps);

if (!hist_outfile.empty()) write_histogram(hist_outfile, hist);
if (!hist_outfile.empty())
write_histogram(hist_outfile, hist);
}
catch (const std::exception &e) {
cerr << e.what() << endl;
Expand Down

0 comments on commit f2fe044

Please sign in to comment.