Skip to content

Commit

Permalink
Merge pull request #78 from smithlabcode/add-option-for-empty-input
Browse files Browse the repository at this point in the history
Adding option to exit without error on empty input
  • Loading branch information
andrewdavidsmith authored Nov 21, 2024
2 parents e9777a9 + 9945aeb commit 6e6e3c6
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 45 deletions.
89 changes: 54 additions & 35 deletions src/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cmath>
#include <sstream>
#include <cstdlib>
#include <cassert>

using std::string;
using std::vector;
Expand All @@ -37,6 +38,8 @@ using std::ostringstream;
using std::istringstream;
using std::getline;

template<typename T> using num_lim = std::numeric_limits<T>;

/*****************************************************************************/
/******************* AUX FUNCTIONS *******************************************/
/*****************************************************************************/
Expand Down Expand Up @@ -97,7 +100,7 @@ make_exponential_base_groups(vector<BaseGroup> &base_groups,
/************* LINEAR BASE GROUP *************/
// aux function to get linear interval
size_t
get_linear_interval(const size_t &num_bases) {
get_linear_interval(const size_t num_bases) {
// The the first 9bp as individual residues since odd stuff
// can happen there, then we find a grouping value which gives
// us a total set of groups below 75. We limit the intervals
Expand Down Expand Up @@ -174,7 +177,8 @@ double get_corrected_count(size_t count_at_limit,
size_t num_reads,
size_t dup_level,
size_t num_obs) {
// See if we can bail out early
// See if we can bail out early (ADS: can we know if num_reads <=
// count_at_limit always holds?)
if (count_at_limit == num_reads)
return num_obs;

Expand Down Expand Up @@ -210,7 +214,7 @@ double get_corrected_count(size_t count_at_limit,

// Now we can assume that the number we observed can be
// scaled up by this proportion
return num_obs/(1 - p_not_seeing);
return num_obs/std::max(num_lim<double>::min(), 1.0 - p_not_seeing);
}

// Function to calculate the deviation of a histogram with 100 bins from a
Expand Down Expand Up @@ -277,7 +281,8 @@ sum_deviation_from_normal(const array <double, 101> &gc_count,
// centre of the model
mode = first_mode;
} else {
mode /= mode_duplicates;
// ADS: check if we need to avoid divide-by-zero here
mode /= std::max(static_cast<size_t>(1), mode_duplicates);
}

// We can now work out a theoretical distribution
Expand All @@ -286,7 +291,8 @@ sum_deviation_from_normal(const array <double, 101> &gc_count,
stdev += (i - mode) * (i - mode) * gc_count[i];
}

stdev = stdev / (total_count-1);
// ADS: check if we need to avoid divide-by-zero here
stdev = stdev / std::max(num_lim<double>::min(), total_count - 1.0);
stdev = sqrt(stdev);

/******************* END COPIED FROM FASTQC **********************/
Expand All @@ -297,20 +303,24 @@ sum_deviation_from_normal(const array <double, 101> &gc_count,
// ADS: lonely magic below; what is the 100?
for (size_t i = 0; i <= 100; ++i) {
z = i - mode;
// ADS: check if we need to avoid divide-by-zero here
theoretical[i] = exp(- (z*z)/ (2.0 * stdev *stdev));
theoretical_sum += theoretical[i];
}

// Normalize theoretical so it sums to the total of readsq
for (size_t i = 0; i <= 100; ++i) {
theoretical[i] = theoretical[i] * total_count / theoretical_sum;
// ADS: check if we need to avoid divide-by-zero here
theoretical[i] = theoretical[i] * total_count /
std::max(num_lim<double>::min(), theoretical_sum);
}

for (size_t i = 0; i <= 100; ++i) {
ans += fabs(gc_count[i] - theoretical[i]);
}
// Fractional deviation
return 100.0 * ans / total_count;
// Fractional deviation (ADS: check if we need to avoid
// divide-by-zero here)
return 100.0 * ans / std::max(num_lim<double>::min(), total_count);
}

/***************************************************************/
Expand Down Expand Up @@ -446,15 +456,16 @@ ModuleBasicStatistics::summarize_module(FastqStats &stats) {
total_bases += i * stats.long_read_length_freq[i - FastqStats::SHORT_READ_THRESHOLD];
}

avg_read_length = total_bases / total_sequences;
avg_read_length =
total_bases / std::max(static_cast<size_t>(1), total_sequences);

// counts bases G and C in each base position
avg_gc = 0;

// GC %
// GS: TODO delete gc calculation during stream and do it using the total G
// counts in all bases
avg_gc = 100 * stats.total_gc / static_cast<double>(total_bases);
avg_gc = 100 * stats.total_gc / std::max(1.0, static_cast<double>(total_bases));

}

Expand Down Expand Up @@ -692,6 +703,7 @@ ModulePerBaseSequenceQuality::summarize_module(FastqStats &stats) {
}

const size_t base_positions = base_groups[group].end - base_groups[group].start + 1;
assert(base_positions != static_cast<size_t>(0));
group_mean[group] = mean_group_sum / base_positions;
group_ldecile[group] = static_cast<double>(ldecile_group_sum) / base_positions;
group_lquartile[group] = static_cast<double>(lquartile_group_sum) / base_positions;
Expand Down Expand Up @@ -819,17 +831,19 @@ ModulePerTileSequenceQuality::summarize_module(FastqStats &stats) {

// Now transform sum into mean
for (size_t i = 0; i < max_read_length; ++i)
if (position_counts[i] > 0)
if (position_counts[i] > 0.0)
mean_in_base[i] = mean_in_base[i] / position_counts[i];
else
mean_in_base[i] = 0;
mean_in_base[i] = 0.0;

for (auto &v : tile_position_quality) {
const size_t lim = v.second.size();
for (size_t i = 0; i < lim; ++i) {
// transform sum of all qualities in mean
const size_t count_at_pos =
stats.tile_position_count.find(v.first)->second[i];
const auto itr = stats.tile_position_count.find(v.first);
if (itr == cend(stats.tile_position_count))
throw runtime_error("failure ModulePerTileSequenceQuality::summarize_module");
const size_t count_at_pos = itr->second[i];

if (count_at_pos > 0)
v.second[i] = v.second[i] / count_at_pos;
Expand Down Expand Up @@ -882,6 +896,7 @@ ModulePerTileSequenceQuality::write_module(ostream &os) {

inline double
round_quantile(const double val, const double num_quantiles) {
// ADS: check if we need to worry about divide by zero here
return static_cast<int>(val * num_quantiles) / num_quantiles;
}

Expand Down Expand Up @@ -937,6 +952,7 @@ ModulePerTileSequenceQuality::make_html_data() {
// We will now discretize the quantiles so plotly understands
// the color scheme
static const double num_quantiles = 20.0;
// ADS: not sure if we need to worry about divide by zero here?
double mid_point = round_quantile(min_val/(min_val - max_val), num_quantiles);

// - 10: red
Expand Down Expand Up @@ -1054,7 +1070,7 @@ Module(ModulePerBaseSequenceContent::module_name) {
void
ModulePerBaseSequenceContent::summarize_module(FastqStats &stats) {
double a_group, t_group, g_group, c_group, n_group;
double a_pos, t_pos, g_pos, c_pos, n_pos;
double a_pos{}, t_pos{}, g_pos{}, c_pos{}, n_pos{};
double total; //a+c+t+g+n
max_diff = 0.0;

Expand Down Expand Up @@ -1105,10 +1121,10 @@ ModulePerBaseSequenceContent::summarize_module(FastqStats &stats) {

const double total_pos =
static_cast<double>(a_pos + c_pos + g_pos + t_pos + n_pos);
a_pos = 100.0 * a_pos / total_pos;
c_pos = 100.0 * c_pos / total_pos;
g_pos = 100.0 * g_pos / total_pos;
t_pos = 100.0 * t_pos / total_pos;
a_pos = 100.0 * a_pos / std::max(num_lim<double>::min(), total_pos);
c_pos = 100.0 * c_pos / std::max(num_lim<double>::min(), total_pos);
g_pos = 100.0 * g_pos / std::max(num_lim<double>::min(), total_pos);
t_pos = 100.0 * t_pos / std::max(num_lim<double>::min(), total_pos);

// for WGBS, we only test non-bisulfite treated bases
if (!is_reverse_complement)
Expand All @@ -1135,11 +1151,10 @@ ModulePerBaseSequenceContent::summarize_module(FastqStats &stats) {

// turns above values to percent
total = static_cast<double>(a_group + c_group + t_group + g_group + n_group);
a_pct[group] = 100.0*a_group / total;
c_pct[group] = 100.0*c_group / total;
g_pct[group] = 100.0*g_group / total;
t_pct[group] = 100.0*t_group / total;

a_pct[group] = 100.0*a_group / std::max(num_lim<double>::min(), total);
c_pct[group] = 100.0*c_group / std::max(num_lim<double>::min(), total);
g_pct[group] = 100.0*g_group / std::max(num_lim<double>::min(), total);
t_pct[group] = 100.0*t_group / std::max(num_lim<double>::min(), total);
}
}

Expand Down Expand Up @@ -1395,12 +1410,14 @@ ModulePerBaseNContent::summarize_module(FastqStats &stats) {

this_n_total = (i < FastqStats::SHORT_READ_THRESHOLD) ? (stats.cumulative_read_length_freq[i]) :
(stats.long_cumulative_read_length_freq[i - FastqStats::SHORT_READ_THRESHOLD]);
this_n_pct = this_n_cnt / static_cast<double>(this_n_total);
this_n_pct = this_n_cnt / std::max(num_lim<double>::min(),
static_cast<double>(this_n_total));
max_n_pct = max(max_n_pct, this_n_pct);
group_n_cnt += this_n_cnt;
group_n_total += this_n_total;
}
n_pct[group] = 100.0*group_n_cnt / static_cast<double>(group_n_total);
n_pct[group] = 100.0*group_n_cnt / std::max(num_lim<double>::min(),
static_cast<double>(group_n_total));
}
}

Expand Down Expand Up @@ -1627,15 +1644,15 @@ ModuleSequenceDuplicationLevels::summarize_module(FastqStats &stats) {
}

// "Sequence duplication estimate" in the summary
total_deduplicated_pct = 100.0 * seq_dedup / seq_total;
total_deduplicated_pct = 100.0 * seq_dedup / std::max(1.0, seq_total);

// Convert to percentage
for (auto &v : percentage_deduplicated)
v = 100.0 * v / seq_dedup; // Percentage of unique sequences in bin
v = 100.0 * v / std::max(1.0, seq_dedup); // Percentage of unique sequences in bin

// Convert to percentage
for (auto &v : percentage_total)
v = 100.0 * v / seq_total; // Percentage of sequences in bin
v = 100.0 * v / std::max(1.0, seq_total); // Percentage of sequences in bin
}

void
Expand Down Expand Up @@ -1796,7 +1813,7 @@ ModuleOverrepresentedSequences::make_grade() {
// implment pass warn fail for overrep sequences
if (grade != "fail") {
// get percentage that overrep reads represent
double pct = 100.0 * seq.second / num_reads;
double pct = 100.0 * seq.second / std::max(static_cast<size_t>(1), num_reads);
if (pct > grade_error) {
grade = "fail";
}
Expand All @@ -1813,7 +1830,7 @@ ModuleOverrepresentedSequences::write_module(ostream &os) {
os << "#Sequence\tCount\tPercentage\tPossible Source\n";
for (auto seq : overrep_sequences) {
os << seq.first << "\t" << seq.second << "\t" <<
100.0 * seq.second / num_reads << "\t"
100.0 * seq.second / std::max(static_cast<size_t>(1), num_reads) << "\t"
<< get_matching_contaminant(seq.first) << "\n";
}
}
Expand All @@ -1836,7 +1853,7 @@ ModuleOverrepresentedSequences::make_html_data() {
for (auto v : overrep_sequences) {
data << "<tr><td>" << v.first << "</td>";
data << "<td>" << v.second << "</td>";
data << "<td>" << 100.0 * v.second / num_reads << "</td>";
data << "<td>" << 100.0 * v.second / std::max(static_cast<size_t>(1), num_reads) << "</td>";
data << "<td>" << get_matching_contaminant(v.first)
<< "</td>";
data << "</tr>";
Expand Down Expand Up @@ -1907,7 +1924,8 @@ ModuleAdapterContent::summarize_module(FastqStats &stats) {
for (size_t i = 0; i < adapter_pos_pct.size(); ++i) {
for (size_t j = 0; j < adapter_pos_pct[0].size(); ++j) {
adapter_pos_pct[i][j] *= 100.0;
adapter_pos_pct[i][j] /= static_cast<double>(stats.num_reads);
adapter_pos_pct[i][j] /= std::max(num_lim<double>::min(),
static_cast<double>(stats.num_reads));
}
}
}
Expand Down Expand Up @@ -2077,7 +2095,8 @@ ModuleKmerContent::summarize_module(FastqStats &stats) {
observed_count =
stats.kmer_count[(i << Constants::bit_shift_kmer) | kmer];

expected_count = pos_kmer_count[i] / dividend;
expected_count = pos_kmer_count[i] / std::max(num_lim<double>::min(), dividend);
// ADS: below, denom can't be zero if not above?
obs_exp_ratio = (expected_count > 0) ? (observed_count / expected_count) : 0;

if (i == 0 || obs_exp_ratio > obs_exp_max[kmer]) {
Expand Down Expand Up @@ -2146,7 +2165,7 @@ ModuleKmerContent::make_html_data() {

for (size_t i = 0; i < lim; ++i) {
const size_t kmer = kmers_to_report[i].first;
const double log_obs_exp = log(kmers_to_report[i].second)/log(2);
const double log_obs_exp = log(kmers_to_report[i].second)/log(2.0);
if (!seen_first)
seen_first = true;
else
Expand Down
43 changes: 33 additions & 10 deletions src/falco.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/

#include <chrono>
#include <filesystem>
#include <fstream>

#include "FalcoConfig.hpp"
Expand All @@ -37,6 +38,8 @@ using std::string;
using std::to_string;
using std::vector;

namespace fs = std::filesystem;

using std::chrono::duration_cast;
using std::chrono::system_clock;
using time_point = std::chrono::time_point<std::chrono::system_clock>;
Expand All @@ -61,12 +64,7 @@ log_process(const string &s) {
// Function to check existance of directory
static bool
dir_exists(const string &path) {
struct stat info;
if (stat(path.c_str(), &info) != 0)
return false;
else if (info.st_mode & S_IFDIR)
return true;
return false;
return fs::exists(path) && fs::is_directory(path);
}

// Read any file type until the end and logs progress
Expand All @@ -75,7 +73,7 @@ template <typename T>
void
read_stream_into_stats(T &in, FastqStats &stats, FalcoConfig &falco_config) {
// open file
size_t file_size = in.load();
size_t file_size = std::max(in.load(), static_cast<size_t>(1));
size_t tot_bytes_read = 0;

// Read record by record
Expand All @@ -90,11 +88,10 @@ read_stream_into_stats(T &in, FastqStats &stats, FalcoConfig &falco_config) {

// if I could not get tile information from read names, I need to tell this to
// config so it does not output tile data on the summary or html
if (in.tile_ignore) {
if (in.tile_ignore)
falco_config.do_tile = false;
}

if (tot_bytes_read < file_size && !quiet)
if (!quiet && tot_bytes_read < file_size)
progress.report(cerr, file_size);
}

Expand Down Expand Up @@ -293,6 +290,7 @@ main(int argc, const char **argv) {
bool skip_html = false;
bool skip_short_summary = false;
bool do_call = false;
bool allow_empty_input = false;

// a tmp boolean to keep compatibility with FastQC
bool tmp_compatibility_only = false;
Expand Down Expand Up @@ -542,6 +540,14 @@ main(int argc, const char **argv) {
" in programs that are very strict about the "
" FastQC output format).",
false, do_call);

opt_parse.add_opt(
"allow-empty-input", '\0',
"[Falco only] allow empty input files and generate empty output files "
"without en error state. WARNING: using this option can mask problems in "
"other parts of a workflow.",
false, allow_empty_input);

vector<string> leftover_args;
opt_parse.parse(argc, argv, leftover_args);
if (argc == 1 || opt_parse.help_requested()) {
Expand Down Expand Up @@ -578,6 +584,23 @@ main(int argc, const char **argv) {
return EXIT_FAILURE;
}

// ADS: make sure all input files are non-empty unless user oks it
if (!allow_empty_input) {
for (const auto &fn : leftover_args) {
std::error_code ec;
const bool empty_file = std::filesystem::is_empty(fn, ec);
if (ec) {
cerr << "Error reading file: " << fn << " (" << ec.message() << ")"
<< endl;
return EXIT_FAILURE;
}
else if (empty_file) {
cerr << "Input file is empty: " << fn << endl;
return EXIT_FAILURE;
}
}
}

if (!outdir.empty()) {
if (!summary_filename.empty())
cerr << "[WARNING] specifying custom output directory but also "
Expand Down

0 comments on commit 6e6e3c6

Please sign in to comment.