From b571bc782877d3c06481ee33d73e0a4f61bdef4b Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Fri, 18 Oct 2019 10:16:44 +0200 Subject: [PATCH 01/61] Fix bug in logic for outputting non_cpg positions (classic gemBS format) --- README.md | 1 + tools/gemBS_plugins/mextr.c | 18 +++++--- tools/gemBS_plugins/mextr.h | 1 + tools/gemBS_plugins/output.c | 83 +++++++++++++++++++++++++++++++++++- 4 files changed, 95 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 6fb409ef..34c13539 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ Documentation can be found at Changelog: ---------- + 3.3.2 Fix generation of non_cpg files 3.3.1 Fix Attribute error bug due to not checking if conversion is a list 3.3.0 Make new release for IHEC 3.3.0 Switch conversion default in IHEC_standard configuration to 0.01,0.05 rather than auto, which can give odd results if conversion controls not present or not working correctly diff --git a/tools/gemBS_plugins/mextr.c b/tools/gemBS_plugins/mextr.c index 8cc10f39..c4ce9516 100644 --- a/tools/gemBS_plugins/mextr.c +++ b/tools/gemBS_plugins/mextr.c @@ -462,8 +462,7 @@ static fmt_field_t tags[] = { bcf1_t *process(bcf1_t *rec) { static int idx; - static int32_t curr_rid = -1, prev_pos; - static bool valid[2] = {false, false}; + static int32_t curr_rid = -1, prev_pos = -1; static bcf1_t prev_rec; int ns = bcf_hdr_nsamples(args.hdr); @@ -479,7 +478,7 @@ bcf1_t *process(bcf1_t *rec) break; } } - if(cg || args.output_noncpg) { // Site with potentially Cs or Gs (or we are outputting non_cpgs) + if(cg) { // Site with potentially Cs or Gs bcf_unpack(rec, BCF_UN_ALL); // Get format tags for(int ix = 0; tags[ix].tag != NULL; ix++) { @@ -561,17 +560,24 @@ bcf1_t *process(bcf1_t *rec) } } } - valid[idx] = true; // Here is the logic for deciding what we print + + // check if we are next to the previous record + bool consec = false; if(rec->rid != curr_rid) curr_rid = rec->rid; - else if(rec->pos - prev_pos == 1 && valid[idx ^ 1]) 
output_cpg(&args, &prev_rec, tags, sample_gt, idx ^ 1, sample_cpg, sample_Q); + else if(rec->pos - prev_pos == 1) consec = true; + if(consec) { + output_cpg(&args, &prev_rec, tags, sample_gt, idx ^ 1, sample_cpg, sample_Q); + } else if(args.output_noncpg && prev_pos >= 0) { + output_nonconsec_noncpg(&args, &prev_rec, tags, sample_gt, idx ^ 1, true, sample_cpg, sample_Q); + output_nonconsec_noncpg(&args, rec, tags, sample_gt, idx, false, sample_cpg, sample_Q); + } if(args.bedmethyl || args.wigfile) { output_bedmethyl(&args, rec, tags, sample_gt, idx); } idx ^= 1; prev_pos = rec->pos; memcpy(&prev_rec, rec, sizeof(bcf1_t)); - valid[idx] = false; if(st != NULL) st->n_sites_pass++; } } diff --git a/tools/gemBS_plugins/mextr.h b/tools/gemBS_plugins/mextr.h index 68ce3b5b..4b8fd9a4 100644 --- a/tools/gemBS_plugins/mextr.h +++ b/tools/gemBS_plugins/mextr.h @@ -102,6 +102,7 @@ void calc_gt_prob(gt_meth *gt, args_t *args, char rf); void calc_cpg_meth(args_t *args, int ns, cpg_prob *cpg, gt_meth *g1, gt_meth *g2); double get_meth(gt_meth *g, int idx); void output_cpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx, cpg_prob *sample_cpg, double *Q[]); +void output_nonconsec_noncpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx, bool first, cpg_prob *sample_cpg, double *Q[]); void output_bedmethyl(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx); void fill_base_prob_table(void); diff --git a/tools/gemBS_plugins/output.c b/tools/gemBS_plugins/output.c index 360a03cc..de73b717 100644 --- a/tools/gemBS_plugins/output.c +++ b/tools/gemBS_plugins/output.c @@ -209,7 +209,7 @@ void output_cpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { gt_meth *g = sample_gt[idx ^ pos] + ix; if(!g->skip) { - int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. 
of not being called genotype + int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. of not being called genotype fprintf(fp, "\t%c\tGQ=%d", gt_iupac[g->max_gt], gq); if(g->max_gt != (pos ? 7 : 4)) { int dq = calc_phred(exp(g->gt_prob[pos ? 7 : 4])); // Prob. of being CG @@ -250,10 +250,89 @@ void output_cpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt } fputc('\n', fp); } - } + } } } +void output_nonconsec_noncpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx, bool first, cpg_prob *cpg, double *Q[]) { + static char *cx; + static int32_t cx_n; + static char *gt_iupac = "AMRWCSYGKT"; + static uint8_t gt_msk[] = {0x11, 0xb3, 0x55, 0x99, 0xa2, 0xf6, 0xaa, 0x54, 0xdc, 0x88}; + + FILE *fp = args->cpgfile; + int ns = bcf_hdr_nsamples(args->hdr); + int min_n = args->min_num; + int n1 = (int)(args->min_prop * (double)ns + 0.5); + if(n1 > min_n) min_n = n1; + fp=args->noncpgfile; + assert(fp != NULL); + for(int ix = 0; ix < ns; ix++) { + double z = 0.0; + gt_meth *g = sample_gt[idx] + ix; + if(!g->skip) { + if(first) { + if(g->counts[5] >= args->min_nc && (g->counts[5] + g->counts[7] >= args->min_inform)) { + if(args->sel_mode == SELECT_HOM) z = exp(g->gt_prob[4]); + else z = exp(g->gt_prob[1]) + exp(g->gt_prob[4]) + exp(g->gt_prob[5]) + exp(g->gt_prob[6]); + } + } else { + if(g->counts[6] >= args->min_nc && (g->counts[6] + g->counts[4] >= args->min_inform)) { + if(args->sel_mode == SELECT_HOM) z = exp(g->gt_prob[7]); + else z = exp(g->gt_prob[2]) + exp(g->gt_prob[5]) + exp(g->gt_prob[7]) + exp(g->gt_prob[8]); + } + } + } + Q[2][ix] = z; + } + double *p = get_prob_dist(ns, Q); + double z = p[0]; + for(int i = 1; i <= ns && i < min_n; i++) z += p[i]; + int phred = calc_phred(z); + if(phred >= args->sel_thresh) { + int cx_len = bcf_get_info_values(args->hdr, rec, "CX", (void **)&cx, &cx_n, BCF_HT_STR); + int cx_sz = tags[FMT_CX].st[idx].ne / ns; + int *mq_p = tags[FMT_MQ].st[idx].ne == ns ? 
tags[FMT_MQ].st[idx].dat_p : NULL; + fprintf(fp,"%s\t%d\t%d\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 1, cx_len >= 3 ? cx[2] : '.'); + char *cx_p = tags[FMT_CX].st[idx].dat_p; + for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { + gt_meth *g = sample_gt[idx] + ix; + if(!g->skip) { + int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. of not being called genotype + fprintf(fp, "\t%c\tGQ=%d", gt_iupac[g->max_gt], gq); + if(g->max_gt != (first ? 4 : 7)) { + int dq = calc_phred(exp(g->gt_prob[first ? 4 : 7])); // Prob. of being CG + fprintf(fp, ";DQ=%d", dq); + } + int mq = -1; + if(mq_p != NULL) mq = mq_p[ix]; + if(mq >= 0) fprintf(fp, ";MQ=%d", mq); + if(cx_sz >= 5) fprintf(fp, ";CX=%.3s", cx_p + 2); + int32_t ct[4]; + if(!first) { + ct[0] = g->counts[6]; + ct[1] = g->counts[4]; + } else { + ct[0] = g->counts[5]; + ct[1] = g->counts[7]; + } + ct[2] = ct[3] = 0; + uint8_t m = 1; + uint8_t msk = gt_msk[g->max_gt]; + for(int i = 0; i < 8; i++, m <<= 1) { + ct[3] += g->counts[i]; + if(msk & m) ct[2] += g->counts[i]; + } + double meth = get_meth(g, !first); + fprintf(fp, "\t%g\t%d\t%d\t%d\t%d", meth, ct[0], ct[1], ct[2], ct[3]); + } else { + fputs("\t.\t.\t.\t.\t.\t.\t.\t.", fp); + } + } + fputc('\n', fp); + } +} + static char *rgb_tab[11] = { "0,255,0", "55,255,0", "105,255,0", "155,255,0", "205,255,0", "255,255,0", "255,205,0", "255,155,0", "255,105,0", "255,55,0", "255,0,0" }; From d1f917426bcc1e804cf808edb4d51782e9eaff46 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Fri, 18 Oct 2019 10:21:29 +0200 Subject: [PATCH 02/61] Fix bug in wig generations where header lines could be omitted --- README.md | 1 + gemBS/version.py | 2 +- tools/gemBS_plugins/output.c | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 34c13539..0fffce3a 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ Documentation can be found at Changelog: ---------- + 3.3.2 Fix error where header line for wig files could 
be omitted 3.3.2 Fix generation of non_cpg files 3.3.1 Fix Attribute error bug due to not checking if conversion is a list 3.3.0 Make new release for IHEC diff --git a/gemBS/version.py b/gemBS/version.py index f4f99317..1ede770c 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" __VERSION_MINOR = "3" -__VERSION_SUBMINOR = "1" +__VERSION_SUBMINOR = "2" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) diff --git a/tools/gemBS_plugins/output.c b/tools/gemBS_plugins/output.c index de73b717..eea81a65 100644 --- a/tools/gemBS_plugins/output.c +++ b/tools/gemBS_plugins/output.c @@ -385,7 +385,7 @@ void output_bedmethyl(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sam } int32_t cov = ct[0] + ct[1]; double m = cov > 0 ? (double)ct[0] / (double)cov : 0.0; - if(cov > 0 && args->wigfile) { + if(cov > 0) { FILE *fp = args->wigfile; if(fp != NULL) { if(rec->rid != old_rid) { @@ -393,6 +393,7 @@ void output_bedmethyl(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sam } fprintf(fp, "%u\t%.4g\n", rec->pos + 1, 100.0 * m); } + old_rid = rec->rid; } FILE *fp = args->bedmethylfiles[btype]; if(fp != NULL) { @@ -401,7 +402,6 @@ void output_bedmethyl(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sam args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 1, args->bedmethyl_desc, cov > 1000 ? 
1000 : cov, strand, rec->pos, rec->pos + 1, rgb_tab[(int)(m * 10.0 + 0.5)], cov, (int)(100.0 * m), rtmp, rtmp + 4, gq); } - old_rid = rec->rid; old_pos = rec->pos; } } From 2be0e3374357ca8236fa21670b387b1f4fa1a204 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Fri, 25 Oct 2019 16:59:35 +0200 Subject: [PATCH 03/61] Fix issue "error in gemBS index #65" by removing unused legacy pathway --- README.md | 1 + gemBS/__init__.py | 21 ++------------------- gemBS/production.py | 2 +- gemBS/version.py | 2 +- tools/gem3-mapper | 2 +- 5 files changed, 6 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 0fffce3a..d3bccd68 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ Documentation can be found at Changelog: ---------- + 3.3.3 Remove legacy pathway for config files with no header line (fix issue 'error in gemBS index #65) 3.3.2 Fix error where header line for wig files could be omitted 3.3.2 Fix generation of non_cpg files 3.3.1 Fix Attribute error bug due to not checking if conversion is a list diff --git a/gemBS/__init__.py b/gemBS/__init__.py index a68853ba..695e58c1 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -335,9 +335,9 @@ def prepareConfiguration(text_metadata=None,lims_cnag_json=None,configFile=None, with open(text_metadata, 'r') as f: reader = csv.reader(f) try: - line = reader.__next__() + line = next(reader) except StopIteration: - raise ValueError('Empty configuration file'); + raise ValueError('Empty configuration file') header_found = {} col_desc = [] for i, entry in enumerate(line): @@ -432,23 +432,6 @@ def prepareConfiguration(text_metadata=None,lims_cnag_json=None,configFile=None, if len(file_dict) == 2 and not type in sampleDirectory: sampleDirectory['type'] = "PAIRED" generalDictionary['sampleData'][fli] = sampleDirectory - elif len(line) == 5: - # Parse as simple 5 field csv file (no header) - while True: - sampleDirectory = {} - sampleDirectory["sample_barcode"] = line[0].strip() - 
sampleDirectory["library_barcode"] = line[1].strip() - flowcell = line[2].strip() - lane = line[3].strip() - index = line[4].strip() - fli = "{}_{}_{}".format(flowcell, lane, index) - fli1 = "{}_{}_0".format(flowcell, lane) - sampleDirectory["alt_fli"] = fli1 - generalDictionary['sampleData'][fli] = sampleDirectory - try: - line = reader.next() - except StopIteration: - break else: raise ValueError('Could not parse config file') diff --git a/gemBS/production.py b/gemBS/production.py index 9386b9df..8fc9d6cf 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -62,7 +62,7 @@ class PrepareConfiguration(Command): Two files are required, a configuration file describing the model parameters and analysis directory structure, and second file describing the sample metadata and associated data files. - The sample file will normally be a text file in CSV format with an optional (although recommended) header line, + The sample file will normally be a text file in CSV format with a header line, although there is also the option to import a JSON file from the CNAG LIMS. A full description of the input file formats can be found in the gemBS documentation. 
diff --git a/gemBS/version.py b/gemBS/version.py index 1ede770c..24a79190 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" __VERSION_MINOR = "3" -__VERSION_SUBMINOR = "2" +__VERSION_SUBMINOR = "3" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) diff --git a/tools/gem3-mapper b/tools/gem3-mapper index 97f66ada..99021a6b 160000 --- a/tools/gem3-mapper +++ b/tools/gem3-mapper @@ -1 +1 @@ -Subproject commit 97f66ada5a28d00e42aeb99d09f1fcde14965268 +Subproject commit 99021a6b3f7bf3eaa1904b9a4c92efef61dea455 From 272fcabcf6cb7e9eee64f2a1b29946077a39ba86 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Mon, 9 Dec 2019 12:55:19 +0100 Subject: [PATCH 04/61] Move to new bscall version --- tools/Makefile | 8 ++++---- tools/bs_call | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index 3f5bcbc4..0588f3d0 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -10,8 +10,8 @@ ROOT_PATH=$(CURDIR) # samtools and bcftools definitions -SAMTOOLS_VERSION=1.8 -BCFTOOLS_VERSION=1.8 +SAMTOOLS_VERSION=1.9 +BCFTOOLS_VERSION=1.9 UCSCTOOLS_VERSION=v378 SAMTOOLS_DIR=samtools BCFTOOLS_DIR=bcftools @@ -67,8 +67,8 @@ _utils: setup: @mkdir -p $(FOLDER_BIN) -bs_call/src/Makefile.mk: bs_call/src/Makefile.mk.in bs_call/GEMTools/Makefile.mk.in bs_call/configure - cd bs_call; ./configure ${BS_CALL_CONFIG} +bs_call/src/Makefile.mk: bs_call/src/Makefile.mk.in bs_call/gt/Makefile.mk.in bs_call/configure _samtools + cd bs_call; ./configure ${BS_CALL_CONFIG} --with-htslib=../${SAMTOOLS_DIR}/htslib-${SAMTOOLS_VERSION} gem3: gem3-mapper/Makefile.mk $(MAKE) --directory=gem3-mapper diff --git a/tools/bs_call b/tools/bs_call index 29c6dd9d..db40bbf3 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 29c6dd9d5a1f83cdd52e8b940d48ed56f0083941 +Subproject commit db40bbf3d96dd2a18b1016c0970d1f10efb7cd3f From abb857fdb49e99810c354f29a698981f32b2d948 Mon Sep 17 00:00:00 
2001 From: Simon Heath Date: Mon, 9 Dec 2019 13:00:03 +0100 Subject: [PATCH 05/61] Move to new bs_call version --- README.md | 6 ++ gemBS/__init__.py | 133 ++++++++++++++++++++++++----------------- gemBS/bsCallReports.py | 4 +- gemBS/database.py | 15 ++++- gemBS/production.py | 51 +++++++++++++--- gemBS/utils.py | 8 +-- gemBS/version.py | 4 +- 7 files changed, 149 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index d3bccd68..88c783c0 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,12 @@ Documentation can be found at Changelog: ---------- + 3.4.0 Move to new bs_call version (2.1.0) which is more efficient + in memory use and can read BAMs and write BCFs natively. + The new bs_call requires a faidx indexed reference, so gemBS + no creates this during indexing. + 3.4.0 Add switches to give more control to threads and memory + usage in mapping and calling stages 3.3.3 Remove legacy pathway for config files with no header line (fix issue 'error in gemBS index #65) 3.3.2 Fix error where header line for wig files could be omitted 3.3.2 Fix generation of non_cpg files diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 695e58c1..fa0a8c77 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -500,7 +500,44 @@ def prepareConfiguration(text_metadata=None,lims_cnag_json=None,configFile=None, with open(jsonOutput, 'w') as of: json.dump(generalDictionary, of, indent=2) -def index(input_name, index_name, extra_fasta_files=None,threads=None,tmpDir=None,sampling_rate=None,nonbs_flag=False): +def mk_gembs_reference(input_name, greference, extra_fasta_files=None, threads=None): + """Create bgzipped copy of reference file(s) in the same directory where + the index(es) are stored. The index files will be made from this, and + this will also serve as the reference for the bs_call command. For this + purpose fai and gzi indexes of the reference will be created. 
+ """ + + output_dir, base = os.path.split(greference) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if not os.path.exists(greference): + gcat = [executables['gemBS_cat'],input_name] + for f in extra_fasta_files: + if not os.path.exists(f): + raise CommandException("Reference file '{}' does not exist".format(f)) + + gcat.extend(extra_fasta_files) + bgzip_bin = executables['bgzip'] + if bgzip_bin == None: + raise CommandException("bgzip binary not found (should be bundled with the gemBS distribution)\n"); + bgzip_command = [bgzip_bin] + if threads != None: + bgzip_command.extend(['-@', str(threads)]); + process = run_tools([gcat,bgzip_command], name='gemBS_cat', output = greference) + if process.wait() != 0: + if os.path.exists(greference): + os.remove(greference) + raise ValueError("Error while making gemBS reference") + + process = run_tools([[executables['samtools'],'faidx',greference]], name='samtools faidx', output = 'greference.fai') + if process.wait() != 0: + for f in [greference + '.fai', greference + '.gzi']: + if os.path.exists(f): + os.remove(f) + raise ValueError("Error while making faidx index of gemBS reference") + +def index(input_name, index_name, greference, extra_fasta_files=None,threads=None,tmpDir=None,sampling_rate=None,nonbs_flag=False): """Run the gem-indexer on the given input. Input has to be the path to a single fasta file that contains the genome to be indexed. Output should be the path to the target index file. 
Note that @@ -517,29 +554,8 @@ def index(input_name, index_name, extra_fasta_files=None,threads=None,tmpDir=Non output_dir, base = os.path.split(index_name) if not os.path.exists(output_dir): os.makedirs(output_dir) - if extra_fasta_files: - gcat = [executables['gemBS_cat'],input_name] - for f in extra_fasta_files: - if not os.path.exists(f): - raise CommandException("Reference file '{}' does not exist".format(f)) - - gcat.extend(extra_fasta_files) - compress_bin = executables['pigz'] - if compress_bin == None: - compress_bin = executables['gzip'] - if compress_bin == None: - output = index_base + "_gemBS.tmp" - process = run_tools([gcat], name='gemBS_cat', output = output) - else: - output = index_base + "_gemBS.tmp.gz" - process = run_tools([gcat,[compress_bin]], name='gemBS_cat', output = output) - if process.wait() != 0: - if os.path.exists(output): - os.remove(output) - raise ValueError("Error while concatenating input fasta files") - f_in = output - else: - f_in = input_name + + f_in = greference logfile = os.path.join(output_dir,"gem_indexer_" + base + ".err") logging.gemBS.gt("Creating index") @@ -572,8 +588,6 @@ def index(input_name, index_name, extra_fasta_files=None,threads=None,tmpDir=Non os.remove(f) raise ValueError("Error while executing the Bisulphite gem-indexer") - if f_in != input_name: - os.remove(f_in) if index_name != index_base + ".gem": os.rename(index_base + ".gem", index_name) os.rename(index_base + ".info", index_name + ".info") @@ -600,7 +614,7 @@ def dbSNP_index(list_dbSNP_files=[],dbsnp_index=""): tools = [db_snp_index] #Compress pipe - compress_bin = executables['pigz'] + compress_bin = executables['bgzip'] if compress_bin == None: compress_bin = executables['gzip'] if compress_bin != None: @@ -611,7 +625,7 @@ def dbSNP_index(list_dbSNP_files=[],dbsnp_index=""): if process_dbsnp.wait() != 0: if os.path.isfile(dbsnp_index): os.remove(dbsnp_index) - raise ValueError("Error while executing the dbSNP-indexer") + raise ValueError("Error 
while executing dbSNP-indexer") return os.path.abspath(dbsnp_index) @@ -644,7 +658,8 @@ def makeChromSizes(index_name=None,output=None): def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, read_non_stranded=False,reverse_conv=False,outfile=None, - paired=False,tmpDir="/tmp",threads=1,under_conversion=None, over_conversion=None): + paired=False,tmpDir="/tmp",map_threads=None,sort_threads=None, + sort_memory=None,under_conversion=None, over_conversion=None): """ Start the GEM Bisulfite mapping on the given input. name -- Name basic (FLI) for the input and output fastq files @@ -657,7 +672,9 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, outputDir -- Directory to store the Bisulfite mapping results paired -- Paired End flag tmpDir -- Temporary directory to perform sorting operations - threads -- Number of threads + map_threads -- Number of threads for GEM mapper + sort_threads -- Number of threads for sort operation + sort_memory -- Per thread memory for sort operation under_conversion -- Under conversion sequence over_conversion -- Over conversion sequence """ @@ -674,7 +691,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, mapping.extend(["--i1",inputFiles[0],"--i2",inputFiles[1]]) elif len(inputFiles) == 1: if ftype in ['SAM', 'BAM']: - input_pipe.extend([executables['samtools'],"bam2fq", "--threads", str(threads), inputFiles[0]]) + input_pipe.extend([executables['samtools'],"bam2fq", "--threads", str(map_threads), inputFiles[0]]) elif ftype in ['COMMAND', 'SINGLE_COMMAND', 'PAIRED_COMMAND']: input_pipe.extend(['/bin/sh','-c',inputFiles[0]]) else: @@ -693,7 +710,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, mapping.extend(["--bisulfite-conversion","inferred-C2T-G2A"]) #Number of threads - mapping.extend(["-t",threads]) + mapping.extend(["-t",map_threads]) #Mapping stats report_file = os.path.join(outputDir,"{}.json".format(name)) logfile = 
os.path.join(outputDir,"gem_mapper_{}.err".format(name)) @@ -719,7 +736,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, readNameClean = [executables['readNameClean']] #BAM SORT - bamSort = [executables['samtools'],"sort","-T",os.path.join(tmpDir,name),"-@",threads,"-o",outfile,"-"] + bamSort = [executables['samtools'],"sort","-T",os.path.join(tmpDir,name),"-@",sort_threads,"-m",sort_memory,"-o",outfile,"-"] tools = [mapping,readNameClean,bamSort] @@ -779,8 +796,8 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/"): class BsCaller: def __init__(self,reference,species,right_trim=0,left_trim=5,keep_unmatched=False, - keep_duplicates=False,ignore_duplicates=False,contig_size=None,dbSNP_index_file="",threads="1", - mapq_threshold=None,bq_threshold=None, + keep_duplicates=False,ignore_duplicates=False,contig_size=None,dbSNP_index_file="", + call_threads="1",merge_threads="1",mapq_threshold=None,bq_threshold=None, haploid=False,conversion=None,ref_bias=None,sample_conversion=None): self.reference = reference self.species = species @@ -790,7 +807,8 @@ def __init__(self,reference,species,right_trim=0,left_trim=5,keep_unmatched=Fals self.keep_duplicates = keep_duplicates self.ignore_duplicates = ignore_duplicates self.dbSNP_index_file = dbSNP_index_file - self.threads = threads + self.call_threads = call_threads + self.merge_threads = merge_threads self.mapq_threshold = mapq_threshold self.bq_threshold = bq_threshold self.haploid = haploid @@ -805,9 +823,6 @@ def prepare(self, sample, input_bam, chrom_list, output_bcf, report_file, contig for chrom in chrom_list: f.write("{}\t0\t{}\n".format(chrom, str(self.contig_size[chrom]))) - samtools = [executables['samtools'],'view','-L',contig_bed,'-h',input_bam] - bsCall = [samtools] - parameters_bscall = ['%s' %(executables["bs_call"]),'-r',self.reference,'-n',sample,'--contig-bed',contig_bed,'--report-file',report_file] parameters_bscall.extend(['--right-trim', 
str(self.right_trim), '--left-trim', str(self.left_trim)]) @@ -827,17 +842,24 @@ def prepare(self, sample, input_bam, chrom_list, output_bcf, report_file, contig parameters_bscall.extend(['--conversion', self.conversion]) if self.ref_bias != None: parameters_bscall.extend(['--reference-bias', self.ref_bias]) - #Thresholds + # Thresholds if self.mapq_threshold != None: parameters_bscall.extend(['--mapq-threshold', self.mapq_threshold]) if self.bq_threshold != None: parameters_bscall.extend(['--bq-threshold', self.bq_threshold]) + # Threads + parameters_bscall.extend(['-t', self.call_threads]) + # dbSNP if self.dbSNP_index_file: parameters_bscall.extend(['-D', self.dbSNP_index_file]) + # Output + parameters_bscall.extend(['-O', 'b', '-o', output_bcf]); + + # Input BAM file + + parameters_bscall.append(input_bam); - bsCall.append(parameters_bscall) - - bsCall.append([executables['bcftools'],'convert','-o',output_bcf,'-O','b','--threads',self.threads]) + bsCall = [parameters_bscall] return bsCall class MethylationCallIter: @@ -1013,17 +1035,17 @@ def run(self): self.json_commands[desc]=task else: - bsConcat(list_bcfs, sample, fname) - self.lock.acquire() - if self.remove: - self.methIter.finished(list_bcfs, fname) - else: - self.methIter.finished(None, fname) - self.lock.release() + bsConcat(list_bcfs, sample, self.bsCall.merge_threads, fname) + self.lock.acquire() + if self.remove: + self.methIter.finished(list_bcfs, fname) + else: + self.methIter.finished(None, fname) + self.lock.release() def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=None,samples=None,right_trim=0,left_trim=5,dry_run_com=None, - keep_unmatched=False,keep_duplicates=False,dbSNP_index_file="",threads="1",jobs=1,remove=False,concat=False, + keep_unmatched=False,keep_duplicates=False,dbSNP_index_file="",call_threads="1",merge_threads="1",jobs=1,remove=False,concat=False, 
mapq_threshold=None,bq_threshold=None,haploid=False,conversion=None,ref_bias=None,sample_conversion=None, no_merge=False,json_commands=None,dry_run=False,dry_run_json=None,ignore_db=None,ignore_duplicates=False): @@ -1041,7 +1063,8 @@ def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=No keep_duplicates -- Do not merge duplicate reads ignore_duplicates -- Ignore duplicate flag from SAM/BAM files dbSNP_index_file -- dbSNP Index File - threads -- Number of threads + call_threads -- Number of threads for calling process + merge_threads -- Number of threads for merging process mapq_threshold -- threshold for MAPQ scores bq_threshold -- threshold for base quality scores haploid -- force genotypes to be homozygous @@ -1073,7 +1096,7 @@ def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=No bsCall = BsCaller(reference=reference,species=species,right_trim=right_trim,left_trim=left_trim, keep_unmatched=keep_unmatched,keep_duplicates=keep_duplicates,ignore_duplicates=ignore_duplicates,contig_size=contig_size, - dbSNP_index_file=dbSNP_index_file,threads=threads,mapq_threshold=mapq_threshold,bq_threshold=bq_threshold, + dbSNP_index_file=dbSNP_index_file,call_threads=call_threads,merge_threads=merge_threads,mapq_threshold=mapq_threshold,bq_threshold=bq_threshold, haploid=haploid,conversion=conversion,ref_bias=ref_bias,sample_conversion=sample_conversion) if dry_run_com != None: @@ -1233,7 +1256,7 @@ def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=Fal return os.path.abspath(output_dir) -def bsConcat(list_bcfs=None,sample=None,bcfSample=None): +def bsConcat(list_bcfs=None,sample=None,threads=None,bcfSample=None): """ Concatenates all bcf methylation calls files in one output file. 
list_bcfs -- list of bcf files to be concatenated @@ -1252,6 +1275,8 @@ def bsConcat(list_bcfs=None,sample=None,bcfSample=None): #Concatenation concat = [executables['bcftools'],'concat','-O','b','-o',bcfSample] + if threads != None: + concat.extend(['--threads', threads]) concat.extend(list_bcfs) process = run_tools([concat],name="Concatenation Calls",logfile=logfile) diff --git a/gemBS/bsCallReports.py b/gemBS/bsCallReports.py index 8ca96512..04c4c9cb 100644 --- a/gemBS/bsCallReports.py +++ b/gemBS/bsCallReports.py @@ -543,7 +543,7 @@ def buildSampleBscallReport(self,sample, lock = None): summaryMethylation = SummaryMethylation() #Load al json files - for json_file in chrom_json_files: + for json_file in chrom_json_files: with open(json_file, 'r') as file_json: try: data = json.load(file_json) @@ -589,6 +589,8 @@ def buildSampleBscallReport(self,sample, lock = None): nonCpGReadProfile.add(data["totalStats"]["methylation"]["NonCpGreadProfile"]) except ValueError as e: + print('problem reading JSON file {})'.format(json_file)) + print(e) pass # invalid json #Prepare plot for Methylation levels diff --git a/gemBS/database.py b/gemBS/database.py index 1c30f6c2..62f285b5 100644 --- a/gemBS/database.py +++ b/gemBS/database.py @@ -123,7 +123,7 @@ def check_index(self): c = self.cursor() c.execute("REPLACE INTO indexing VALUES (?, 'reference', 1)",(ref,)) - cdef =config['DEFAULT'] + cdef = config['DEFAULT'] index = cdef.get('index', None) nonbs_index = cdef.get('nonbs_index', None) nonbs_flag = cdef.get('nonbs_flag', False) @@ -159,7 +159,16 @@ def check_index(self): elif index.endswith('.gem'): csizes = index[:-3] + 'contig.sizes' else: - csizes = index + '.contig.sizes' + csizes = index + '.contig.sizes' + if index == None: + greference = os.path.join(index_dir, reference_basename) + '.gemBS.ref' + else: + if index.endswith('.BS.gem'): + greference = index[:-6] + 'gemBS.ref' + elif index.endswith('.gem'): + greference = index[:-3] + 'gemBS.ref' + else: + greference = 
index + '.gemBS.ref' if index == None: index = os.path.join(index_dir, reference_basename) + '.BS.gem' index_ok = 1 if os.path.exists(index) else 0 @@ -180,8 +189,10 @@ def check_index(self): except IOError: nonbs_index_ok = 0 csizes_ok = 1 if os.path.exists(csizes) else 0 + greference_ok = 1 if os.path.exists(greference) and os.path.exists(greference + '.fai') and os.path.exists(greference + '.gzi') else 0 c.execute("REPLACE INTO indexing VALUES (?, 'index', ?)",(index, index_ok)) c.execute("REPLACE INTO indexing VALUES (?, 'contig_sizes', ?)",(csizes,csizes_ok)) + c.execute("REPLACE INTO indexing VALUES (?, 'gembs_reference', ?)",(greference,greference_ok)) if nonbs_index != None: c.execute("REPLACE INTO indexing VALUES (?, 'nonbs_index', ?)",(nonbs_index,nonbs_index_ok)) else: diff --git a/gemBS/production.py b/gemBS/production.py index 8fc9d6cf..814d55fc 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -160,19 +160,30 @@ def run(self, args): index_name, index_ok = db_data['index'] nonbs_index_name, nonbs_index_ok = db_data.get('nonbs_index',(None, 0)) csizes, csizes_ok = db_data['contig_sizes'] + greference, greference_ok = db_data['gembs_reference'] dbsnp_index, dbsnp_ok = db_data.get('dbsnp_idx',(None, 0)) self.threads = jsonData.check(section='index',key='threads',arg=args.threads) args.sampling_rate = jsonData.check(section='index',key='sampling_rate',arg=args.sampling_rate) args.list_dbSNP_files = jsonData.check(section='index',key='dbsnp_files',arg=args.list_dbSNP_files,list_type=True,default=[]) if not fasta_input: raise ValueError('No input reference file specified for Index command') - + if greference_ok == 1: + logging.warning("gemBS reference {} already exists, skipping creation".format(greference)) + else: + ret = mk_gembs_reference(fasta_input, greference, extra_fasta_files=extra_fasta_files, threads=self.threads) + if ret: + self.command = 'mk_gembs_reference' + self.log_parameter() + + logging.gemBS.gt("gemBS reference done: 
{}".format(greference)) + db.check_index() + if index_ok == 1: logging.warning("Bisulphite Index {} already exists, skipping indexing".format(index_name)) else: self.command = 'index' self.log_parameter() - ret = index(fasta_input, index_name, extra_fasta_files=extra_fasta_files, threads=self.threads, sampling_rate=args.sampling_rate, tmpDir=os.path.dirname(index_name)) + ret = index(fasta_input, index_name, greference, extra_fasta_files=extra_fasta_files, threads=self.threads, sampling_rate=args.sampling_rate, tmpDir=os.path.dirname(index_name)) if os.path.exists(csizes): os.remove(csizes) csizes_ok = 0 @@ -250,7 +261,11 @@ def register(self,parser): parser.add_argument('-n', '--sample-name', dest="sample_name", metavar="SAMPLE", help='Name of sample to be mapped.', required=False) parser.add_argument('-b', '--barcode', dest="sample", metavar="BARCODE", help='Barcode of sample to be mapped.', required=False) parser.add_argument('-d', '--tmp-dir', dest="tmp_dir", metavar="PATH", help='Temporary folder to perform sorting operations. Default: /tmp') - parser.add_argument('-t', '--threads', dest="threads", help='Number of threads to perform sorting operations.') + parser.add_argument('-t', '--threads', dest="threads", help='Number of threads for the mapping pipeline. Default: 1'); + parser.add_argument('--map-threads', dest="map_threads", help='Number of threads for GEM mapper. Default: threads',default=None) + parser.add_argument('--sort-threads', dest="sort_threads", help='Number of threads for the sort operations. Default: threads',default=None) + parser.add_argument('--merge-threads', dest="merge_threads", help='Number of threads for the merge operations. Default: threads',default=None) + parser.add_argument('--sort-memory', dest="sort_memory", help='Per thread memory used for the sort operation. 
Default: 768M',default=None) parser.add_argument('-T', '--type', dest="ftype", help='Type of data file (PAIRED, SINGLE, INTERLEAVED, STREAM, BAM)') parser.add_argument('-p', '--paired-end', dest="paired_end", action="store_true", help="Input data is Paired End") parser.add_argument('-r', '--remove', dest="remove", action="store_true", help='Remove individual BAM files after merging.', required=False) @@ -317,6 +332,10 @@ def run(self, args): self.tmp_dir = self.jsonData.check(section='mapping',key='tmp_dir',arg=args.tmp_dir,dir_type=True) self.threads = self.jsonData.check(section='mapping',key='threads',arg=args.threads,default='1') + self.map_threads = self.jsonData.check(section='mapping',key='map_threads',arg=args.map_threads,default=self.threads) + self.sort_threads = self.jsonData.check(section='mapping',key='sort_threads',arg=args.sort_threads,default=self.threads) + self.merge_threads = self.jsonData.check(section='mapping',key='merge_threads',arg=args.merge_threads,default=self.threads) + self.sort_memory = self.jsonData.check(section='mapping',key='sort_memory',arg=args.sort_memory, default='768M') self.reverse_conv = self.jsonData.check(section='mapping',key='reverse_conversion',arg=args.reverse_conv, boolean=True) self.read_non_stranded = self.jsonData.check(section='mapping',key='non_stranded',arg=args.read_non_stranded, boolean=True) if self.read_non_stranded: self.reverse_conv = False @@ -532,6 +551,10 @@ def do_mapping(self, fli): if args.paired_end: com.append('-p') if args.remove: com.append('-r') if args.threads: com.extend(['-t',args.threads]) + if args.map_threads: com.extend(['--map-threads',args.map_threads]) + if args.sort_threads: com.extend(['--sort-threads',args.sort_threads]) + if args.merge_threads: com.extend(['--merge-threads',args.merge_threads]) + if args.sort_memory: com.extend(['--sort-memory',args.sort_memory]) if args.tmp_dir: com.extend(['-d',args.tmp_dir]) if args.read_non_stranded: com.append('-s') if args.reverse_conv: com.append('-R') @@ -560,7 
+583,8 @@ def do_mapping(self, fli): ret = mapping(name=fli,index=self.index,fliInfo=fliInfo,inputFiles=inputFiles,ftype=ftype, read_non_stranded=self.read_non_stranded, reverse_conv=self.reverse_conv, - outfile=outfile,paired=self.paired,tmpDir=tmp,threads=self.threads, + outfile=outfile,paired=self.paired,tmpDir=tmp, + map_threads=self.map_threads,sort_threads=self.sort_threads,sort_memory=self.sort_memory, under_conversion=self.underconversion_sequence,over_conversion=self.overconversion_sequence) if ret: @@ -623,7 +647,7 @@ def do_merge(self, sample, inputs, fname): desc = "merge {}".format(smp) self.json_commands[desc] = task else: - ret = merging(inputs = inputs, sample = sample, threads = self.threads, outname = outfile) + ret = merging(inputs = inputs, sample = sample, threads = self.merge_threads, outname = outfile) if ret: logging.gemBS.gt("Merging process done for {}. Output files generated: {}".format(sample, ','.join(ret))) @@ -667,7 +691,7 @@ def do_merge(self, sample, inputs, fname): desc = "merge {}".format(sample) self.json_commands[desc] = task else: - ret = merging(inputs = [], sample = sample, threads = self.threads, outname = fname) + ret = merging(inputs = [], sample = sample, threads = self.merge_threads, outname = fname) if ret: logging.gemBS.gt("Merging process done for {}. 
Output files generated: {}".format(sample, ','.join(ret))) @@ -726,6 +750,7 @@ def run(self, args): # JSON data self.jsonData = JSONdata(Mapping.gemBS_json) self.threads = self.jsonData.check(section='mapping',key='threads',arg=args.threads,default='1') + self.merge_threads = self.jsonData.check(section='mapping',key='merge_threads',arg=args.threads,default=self.threads) self.remove = self.jsonData.check(section='mapping',key='remove_individual_bams',arg=args.remove, boolean=True) self.dry_run = args.dry_run self.dry_run_json = args.dry_run_json @@ -840,6 +865,8 @@ def register(self, parser): parser.add_argument('-g','--right-trim', dest="right_trim", metavar="BASES",type=int, help='Bases to trim from right of read pair, Default: 0') parser.add_argument('-f','--left-trim', dest="left_trim", metavar="BASES",type=int, help='Bases to trim from left of read pair, Default: 5') parser.add_argument('-t','--threads', dest="threads", metavar="THREADS", help='Number of threads, Default: %s' %self.threads) + parser.add_argument('--call-threads', dest="call_threads", metavar="THREADS", help='Number of threads for calling process, Default: threads') + parser.add_argument('--merge-threads', dest="merge_threads", metavar="THREADS", help='Number of threads for merging process, Default: threads') parser.add_argument('-j','--jobs', dest="jobs", type=int, help='Number of parallel jobs') parser.add_argument('-u','--keep-duplicates', dest="keep_duplicates", action="store_true", help="Do not merge duplicate reads.") parser.add_argument('-U','--ignore_duplicate_flag', dest="ignore_duplicates", action="store_true", help="Ignore duplicate flag from SAM/BAM files.") @@ -874,6 +901,8 @@ def run(self,args): return self.threads = self.jsonData.check(section='calling',key='threads',arg=args.threads,default='1') + self.call_threads = self.jsonData.check(section='calling',key='call_threads',arg=args.call_threads,default=self.threads) + self.merge_threads = 
self.jsonData.check(section='calling',key='merge_threads',arg=args.merge_threads,default=self.threads) self.jobs = self.jsonData.check(section='calling',key='jobs',arg=args.jobs,default=1,int_type=True) self.mapq_threshold = self.jsonData.check(section='calling',key='mapq_threshold',arg=args.mapq_threshold) self.qual_threshold = self.jsonData.check(section='calling',key='qual_threshold',arg=args.qual_threshold) @@ -988,9 +1017,9 @@ def run(self,args): # Get fasta reference && dbSNP index if supplied self.dbSNP_index_file = None for fname, ftype, status in c.execute("SELECT * FROM indexing"): - if ftype == 'reference': + if ftype == 'gembs_reference': if status != 1: - raise CommandException("Fasta reference {} not found. Run 'gemBS index' or correct configuration file and rerun".format(fname)) + raise CommandException("gemBS reference {} not found. Run 'gemBS index' or correct configuration file and rerun".format(fname)) else: self.fasta_reference = fname elif ftype == 'dbsnp_idx': @@ -1102,6 +1131,8 @@ def run(self,args): com.extend(['-j',Mapping.gemBS_json]) com1 = [] if args.threads != None: com1.extend(['-t',args.threads]) + if args.call_threads != None: com1.extend(['--call-threads',args.call_threads]) + if args.merge_threads != None: com1.extend(['--merge-threads',args.merge_threads]) if args.remove != None: com1.append('-r') com2 = [] if args.mapq_threshold != None: com2.extend(['-q',str(args.mapq_threshold)]) @@ -1129,7 +1160,7 @@ def run(self,args): sample_bam=self.sampleBam,output_bcf=self.outputBcf,remove=self.remove,dry_run=self.dry_run, keep_unmatched=self.keep_unmatched,samples=self.samples,dry_run_com=dry_run_com, keep_duplicates=self.keep_duplicates,ignore_duplicates=self.ignore_duplicates, - dbSNP_index_file=self.dbSNP_index_file,threads=self.threads,jobs=self.jobs, + dbSNP_index_file=self.dbSNP_index_file,call_threads=self.call_threads,merge_threads=self.merge_threads,jobs=self.jobs, 
mapq_threshold=self.mapq_threshold,bq_threshold=self.qual_threshold,dry_run_json=self.dry_run_json, haploid=self.haploid,conversion=self.conversion,ref_bias=self.ref_bias,sample_conversion=self.sample_conversion) @@ -1186,6 +1217,7 @@ def register(self,parser): parser.add_argument('-n', '--sample-name',dest="sample_name",metavar="SAMPLE",help="Nmae of sample to be merged",required=False) parser.add_argument('-b', '--sample-barcode',dest="sample",metavar="BARCODE",help="Barcode of sample to be merged",required=False) parser.add_argument('-t', '--threads', dest="threads", metavar="THREADS", help='Number of threads') + parser.add_argument('--merge-threads', dest="merge_threads", metavar="THREADS", help='Number of threads for merge step') parser.add_argument('-r', '--remove', dest="remove", action="store_true", help='Remove individual BAM files after merging.', required=False) parser.add_argument('-j', '--jobs', dest="jobs", type=int, help='Number of parallel jobs') parser.add_argument('--dry-run', dest="dry_run", action="store_true", help="Output mapping commands without execution") @@ -1211,6 +1243,7 @@ def run(self,args): args.dbSNP_index_file = None args.pool = None args.list_pools = 0 + args.call_threads = None args.no_merge = False MethylationCall.run(self, args) diff --git a/gemBS/utils.py b/gemBS/utils.py index 7af748f7..de200894 100644 --- a/gemBS/utils.py +++ b/gemBS/utils.py @@ -149,7 +149,7 @@ def wait(self): # wait for the process exit_value = self.process.wait() logging.debug("Process '%s' finished with %d", str(self), exit_value) - if exit_value is not 0: + if exit_value != 0: logging.error("Process '%s' finished with %d", str(self), exit_value) if self.logfile is not None and isinstance(self.logfile, str): with open(self.logfile) as f: @@ -257,10 +257,10 @@ def wait(self): exit_value = 0 for process in reversed(self.processes): ev = process.wait() - if ev is not 0: + if ev != 0: exit_value = ev self.exit_value = exit_value - if exit_value is not 0: + if 
exit_value != 0: return exit_value except: self.exit_value = 1 @@ -287,7 +287,7 @@ def _prepare_output(output): if isinstance(output, str): return output if isinstance(output, IOBase): - if output.name is not None and output.name is not "": + if output.name is not None and output.name != "": output.close() return output.name raise ProcessError("Can not pass raw file descriptors") diff --git a/gemBS/version.py b/gemBS/version.py index 24a79190..2d5cc7fb 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" -__VERSION_MINOR = "3" -__VERSION_SUBMINOR = "3" +__VERSION_MINOR = "4" +__VERSION_SUBMINOR = "0" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) From 1f0fbef22caeba32212764769cf6e7056fd671e9 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Mon, 9 Dec 2019 16:15:21 +0100 Subject: [PATCH 06/61] Set correct location for bundled htslib --- tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/Makefile b/tools/Makefile index 0588f3d0..78896984 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -68,7 +68,7 @@ setup: @mkdir -p $(FOLDER_BIN) bs_call/src/Makefile.mk: bs_call/src/Makefile.mk.in bs_call/gt/Makefile.mk.in bs_call/configure _samtools - cd bs_call; ./configure ${BS_CALL_CONFIG} --with-htslib=../${SAMTOOLS_DIR}/htslib-${SAMTOOLS_VERSION} + cd bs_call; ./configure ${BS_CALL_CONFIG} --with-htslib=../../${SAMTOOLS_DIR}/htslib-${SAMTOOLS_VERSION} gem3: gem3-mapper/Makefile.mk $(MAKE) --directory=gem3-mapper From d10f4b8a0be662fb10e9e52b70957e4222082dd1 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Tue, 10 Dec 2019 07:37:39 +0100 Subject: [PATCH 07/61] pull in bug fix --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index db40bbf3..60dc5014 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit db40bbf3d96dd2a18b1016c0970d1f10efb7cd3f +Subproject commit 
60dc50147bdd040364971afd40d927b34700cca7 From c6c3b8689cad26566f80524f26700c00448a3080 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 11 Dec 2019 12:12:38 +0100 Subject: [PATCH 08/61] Switch to samtools, bcftools and htslib v1.10 --- tools/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index 78896984..575863c9 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -10,8 +10,8 @@ ROOT_PATH=$(CURDIR) # samtools and bcftools definitions -SAMTOOLS_VERSION=1.9 -BCFTOOLS_VERSION=1.9 +SAMTOOLS_VERSION=1.10 +BCFTOOLS_VERSION=1.10 UCSCTOOLS_VERSION=v378 SAMTOOLS_DIR=samtools BCFTOOLS_DIR=bcftools From 346796842da7d7f5851c4da90f289a20e90c04b9 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 11 Dec 2019 12:14:41 +0100 Subject: [PATCH 09/61] Add new version with benchmark-mode --- tools/gem3-mapper | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/gem3-mapper b/tools/gem3-mapper index 99021a6b..1586839f 160000 --- a/tools/gem3-mapper +++ b/tools/gem3-mapper @@ -1 +1 @@ -Subproject commit 99021a6b3f7bf3eaa1904b9a4c92efef61dea455 +Subproject commit 1586839f638cdae901a3ef0a4b289c77a7234020 From c92575502f400230d86a794711d2ccfe90fdd476 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 11 Dec 2019 12:15:38 +0100 Subject: [PATCH 10/61] Add new version with benchmark-mode and using htslib v1.10 --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index 60dc5014..eee11f42 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 60dc50147bdd040364971afd40d927b34700cca7 +Subproject commit eee11f420f664b01c06d232ee40e1d7c019f5916 From 7867ac3c656efa60dde1bdf25bd0fc1efd32533e Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 11 Dec 2019 12:16:23 +0100 Subject: [PATCH 11/61] Add benchmark-mode --- gemBS/__init__.py | 131 ++++++++++++++++++++++++----------- gemBS/parser.py | 4 +- gemBS/production.py | 18 
+++-- tools/gemBS_plugins/output.c | 12 ++-- tools/gemBS_plugins/snpxtr.c | 2 +- 5 files changed, 111 insertions(+), 56 deletions(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index fa0a8c77..1f572bc4 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -487,7 +487,7 @@ def prepareConfiguration(text_metadata=None,lims_cnag_json=None,configFile=None, db.close() printer = logging.gemBS.gt miss_flag = False - for x in ('Reference','Index','Contig_sizes','NonBS_Index','dbSNP_idx'): + for x in ('Reference','Index','gemBS_Reference','Contig_sizes','NonBS_Index','dbSNP_idx'): v = ix_files.get(x.lower()) if v and v[1] != 1: printer("{} file '{}': Missing".format(x, v[0])) @@ -500,10 +500,23 @@ def prepareConfiguration(text_metadata=None,lims_cnag_json=None,configFile=None, with open(jsonOutput, 'w') as of: json.dump(generalDictionary, of, indent=2) + """Check if file (assumed to exist) is BGZIPPED by checking for magic numbers in the + first 16 bytes of the file (according to BGZIP specifications) + """ +def file_bgzipped(file_name): + a = b'\x1f\x8b\x08\x04' + b = b'\x06\x00\x42\x43\x02\x00' + ret = False + with open(file_name, "rb") as f: + st = f.read(16) + ret = (len(st) == 16 and st[0:4] == a and st[10:16] == b) + return(ret) + def mk_gembs_reference(input_name, greference, extra_fasta_files=None, threads=None): """Create bgzipped copy of reference file(s) in the same directory where - the index(es) are stored. The index files will be made from this, and - this will also serve as the reference for the bs_call command. For this + the index(es) are stored. If the supplied reference is already bgzipped then we + simply make a symbolic link to the existing reference. + This file will serve as the reference for the bs_call command, and for this purpose fai and gzi indexes of the reference will be created. 
""" @@ -512,23 +525,21 @@ def mk_gembs_reference(input_name, greference, extra_fasta_files=None, threads=N os.makedirs(output_dir) if not os.path.exists(greference): - gcat = [executables['gemBS_cat'],input_name] - for f in extra_fasta_files: - if not os.path.exists(f): - raise CommandException("Reference file '{}' does not exist".format(f)) - - gcat.extend(extra_fasta_files) - bgzip_bin = executables['bgzip'] - if bgzip_bin == None: - raise CommandException("bgzip binary not found (should be bundled with the gemBS distribution)\n"); - bgzip_command = [bgzip_bin] - if threads != None: - bgzip_command.extend(['-@', str(threads)]); - process = run_tools([gcat,bgzip_command], name='gemBS_cat', output = greference) - if process.wait() != 0: - if os.path.exists(greference): - os.remove(greference) - raise ValueError("Error while making gemBS reference") + if file_bgzipped(input_name): + os.symlink(os.path.abspath(input_name), greference) + else : + gcat = [executables['gemBS_cat'],input_name] + bgzip_bin = executables['bgzip'] + if bgzip_bin == None: + raise CommandException("bgzip binary not found (should be bundled with the gemBS distribution)\n"); + bgzip_command = [bgzip_bin] + if threads != None: + bgzip_command.extend(['-@', str(threads)]); + process = run_tools([gcat,bgzip_command], name='gemBS_cat', output = greference) + if process.wait() != 0: + if os.path.exists(greference): + os.remove(greference) + raise ValueError("Error while making gemBS reference") process = run_tools([[executables['samtools'],'faidx',greference]], name='samtools faidx', output = 'greference.fai') if process.wait() != 0: @@ -537,7 +548,7 @@ def mk_gembs_reference(input_name, greference, extra_fasta_files=None, threads=N os.remove(f) raise ValueError("Error while making faidx index of gemBS reference") -def index(input_name, index_name, greference, extra_fasta_files=None,threads=None,tmpDir=None,sampling_rate=None,nonbs_flag=False): +def index(input_name, index_name, 
extra_fasta_files=None,threads=None,tmpDir=None,sampling_rate=None,nonbs_flag=False): """Run the gem-indexer on the given input. Input has to be the path to a single fasta file that contains the genome to be indexed. Output should be the path to the target index file. Note that @@ -554,8 +565,22 @@ def index(input_name, index_name, greference, extra_fasta_files=None,threads=Non output_dir, base = os.path.split(index_name) if not os.path.exists(output_dir): os.makedirs(output_dir) - - f_in = greference + if extra_fasta_files: + gcat = [executables['gemBS_cat'],input_name] + for f in extra_fasta_files: + if not os.path.exists(f): + raise CommandException("Reference file '{}' does not exist".format(f)) + + gcat.extend(extra_fasta_files) + output = index_base + "_gemBS.tmp.gz" + process = run_tools([gcat,['pigz']], name='gemBS_cat', output = output) + if process.wait() != 0: + if os.path.exists(output): + os.remove(output) + raise ValueError("Error while concatenating input fasta files") + f_in = output + else: + f_in = input_name logfile = os.path.join(output_dir,"gem_indexer_" + base + ".err") logging.gemBS.gt("Creating index") @@ -587,7 +612,10 @@ def index(input_name, index_name, greference, extra_fasta_files=None,threads=Non if os.path.exists(f) and f != input_name: os.remove(f) raise ValueError("Error while executing the Bisulphite gem-indexer") - + + if f_in != input_name: + os.remove(f_in) + if index_name != index_base + ".gem": os.rename(index_base + ".gem", index_name) os.rename(index_base + ".info", index_name + ".info") @@ -656,10 +684,10 @@ def makeChromSizes(index_name=None,output=None): else: raise ValueError("Info file {} (normally generated by gem-indexer) does not exist".format(info_file)) -def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, +def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None,filetype=None, read_non_stranded=False,reverse_conv=False,outfile=None, 
paired=False,tmpDir="/tmp",map_threads=None,sort_threads=None, - sort_memory=None,under_conversion=None, over_conversion=None): + sort_memory=None,under_conversion=None, over_conversion=None, benchmark_mode=False): """ Start the GEM Bisulfite mapping on the given input. name -- Name basic (FLI) for the input and output fastq files @@ -667,6 +695,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, fliInfo -- FLI object with metadata information (useful for read groups) inputFiles -- List of input files ftype -- input file type + filetype -- output file type read_non_stranded -- Read non stranded reverse_conv - Reverse the normal conversion outputDir -- Directory to store the Bisulfite mapping results @@ -677,6 +706,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, sort_memory -- Per thread memory for sort operation under_conversion -- Under conversion sequence over_conversion -- Over conversion sequence + benchmark_mode -- Remove times etc. 
from output files to simplify file comparisons """ ## prepare the input input_pipe = [] @@ -708,7 +738,9 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, mapping.extend(["--bisulfite-conversion","inferred-G2A-C2T"]) else: mapping.extend(["--bisulfite-conversion","inferred-C2T-G2A"]) - + #Benchmark mode + if benchmark_mode: + mapping.append("--benchmark-mode") #Number of threads mapping.extend(["-t",map_threads]) #Mapping stats @@ -736,7 +768,12 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, readNameClean = [executables['readNameClean']] #BAM SORT - bamSort = [executables['samtools'],"sort","-T",os.path.join(tmpDir,name),"-@",sort_threads,"-m",sort_memory,"-o",outfile,"-"] + bamSort = [executables['samtools'],"sort","-T",os.path.join(tmpDir,name),"-@",sort_threads,"-m",sort_memory,"-o",outfile] + if filetype == 'SINGLE_BAM': + bamSort.append("--write-index") + if benchmark_mode: + bamSort.append("--no-PG") + bamSort.append("-"); tools = [mapping,readNameClean,bamSort] @@ -747,7 +784,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None, return os.path.abspath("%s" % outfile) -def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/"): +def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benchmark_mode=False): """ Merge bam alignment files inputs -- Dictionary of samples and bam list files inputs(Key=sample, Value = [bam1,...,bamN]) @@ -771,7 +808,10 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/"): return_info = [] if inputs: - bammerging.extend([executables['samtools'],"merge","--threads",threads,"-f",bam_filename]) + bammerging.extend([executables['samtools'],"merge","--threads",threads,"--write-index"]) + if benchmark_mode: + bammerging.append("--no-PG") + bammerging.extend(["-f",bam_filename]) for bamFile in inputs: bammerging.append(bamFile) logfile = os.path.join(output,"bam_merge_{}.err".format(sample)) @@ 
-780,13 +820,13 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/"): return_info.append(os.path.abspath(bam_filename)) #Samtools index - logfile = os.path.join(output,"bam_index_{}.err".format(sample)) - indexing = [executables['samtools'], "index", "-@", threads, bam_filename, index_filename] +# logfile = os.path.join(output,"bam_index_{}.err".format(sample)) +# indexing = [executables['samtools'], "index", "-@", threads, bam_filename, index_filename] md5sum = ['md5sum',bam_filename] - processIndex = run_tools([indexing],name="Indexing",logfile=logfile) +# processIndex = run_tools([indexing],name="Indexing",logfile=logfile) processMD5 = run_tools([md5sum],name="BAM MD5",output=md5_filename) - if processIndex.wait() != 0: - raise ValueError("Error while indexing BAM file.") +# if processIndex.wait() != 0: +# raise ValueError("Error while indexing BAM file.") if processMD5.wait() != 0: raise ValueError("Error while calculating md5sum of BAM file.") @@ -798,7 +838,7 @@ class BsCaller: def __init__(self,reference,species,right_trim=0,left_trim=5,keep_unmatched=False, keep_duplicates=False,ignore_duplicates=False,contig_size=None,dbSNP_index_file="", call_threads="1",merge_threads="1",mapq_threshold=None,bq_threshold=None, - haploid=False,conversion=None,ref_bias=None,sample_conversion=None): + haploid=False,conversion=None,ref_bias=None,sample_conversion=None,benchmark_mode=False): self.reference = reference self.species = species self.right_trim = right_trim @@ -816,6 +856,7 @@ def __init__(self,reference,species,right_trim=0,left_trim=5,keep_unmatched=Fals self.ref_bias = ref_bias self.sample_conversion = sample_conversion self.contig_size = contig_size + self.benchmark_mode = benchmark_mode def prepare(self, sample, input_bam, chrom_list, output_bcf, report_file, contig_bed): @@ -833,6 +874,8 @@ def prepare(self, sample, input_bam, chrom_list, output_bcf, report_file, contig parameters_bscall.append('-d') if self.ignore_duplicates: 
parameters_bscall.append('--ignore-duplicates') + if self.benchmark_mode: + parameters_bscall.append('--benchmark-mode') if self.haploid: parameters_bscall.append('-1') if self.conversion != None: @@ -950,7 +993,7 @@ def finished(self, bcf_list, fname): db.close() class MethylationCallThread(th.Thread): - def __init__(self, threadID, methIter, bsCall, lock, remove, dry_run_com, dry_run, dry_run_json, json_commands, conversion, sample_conversion): + def __init__(self, threadID, methIter, bsCall, lock, remove, dry_run_com, dry_run, dry_run_json, json_commands, conversion, sample_conversion, benchmark_mode): th.Thread.__init__(self) self.threadID = threadID self.methIter = methIter @@ -963,6 +1006,7 @@ def __init__(self, threadID, methIter, bsCall, lock, remove, dry_run_com, dry_ru self.json_commands = json_commands self.conversion = conversion self.sample_conversion = sample_conversion + self.benchmark_mode = benchmark_mode def run(self): while True: @@ -1035,7 +1079,7 @@ def run(self): self.json_commands[desc]=task else: - bsConcat(list_bcfs, sample, self.bsCall.merge_threads, fname) + bsConcat(list_bcfs, sample, self.bsCall.merge_threads, fname, self.benchmark_mode) self.lock.acquire() if self.remove: self.methIter.finished(list_bcfs, fname) @@ -1047,7 +1091,7 @@ def run(self): def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=None,samples=None,right_trim=0,left_trim=5,dry_run_com=None, keep_unmatched=False,keep_duplicates=False,dbSNP_index_file="",call_threads="1",merge_threads="1",jobs=1,remove=False,concat=False, mapq_threshold=None,bq_threshold=None,haploid=False,conversion=None,ref_bias=None,sample_conversion=None, - no_merge=False,json_commands=None,dry_run=False,dry_run_json=None,ignore_db=None,ignore_duplicates=False): + no_merge=False,json_commands=None,dry_run=False,dry_run_json=None,ignore_db=None,ignore_duplicates=False,benchmark_mode=False): """ Performs the process to make met5Bhylation calls. 
@@ -1072,6 +1116,7 @@ def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=No remove -- remove individual BCF files after merging ref_bias -- bias to reference homozygote sample_conversion - per sample conversion rates (calculated if conversion == 'auto') + benchmark_mode - remove version and date information from header """ for snp, pl in output_bcf.items(): @@ -1097,7 +1142,7 @@ def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=No bsCall = BsCaller(reference=reference,species=species,right_trim=right_trim,left_trim=left_trim, keep_unmatched=keep_unmatched,keep_duplicates=keep_duplicates,ignore_duplicates=ignore_duplicates,contig_size=contig_size, dbSNP_index_file=dbSNP_index_file,call_threads=call_threads,merge_threads=merge_threads,mapq_threshold=mapq_threshold,bq_threshold=bq_threshold, - haploid=haploid,conversion=conversion,ref_bias=ref_bias,sample_conversion=sample_conversion) + haploid=haploid,conversion=conversion,ref_bias=ref_bias,sample_conversion=sample_conversion,benchmark_mode=benchmark_mode) if dry_run_com != None: jobs = 1 @@ -1107,7 +1152,7 @@ def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=No if jobs < 1: jobs = 1 thread_list = [] for ix in range(jobs): - thread = MethylationCallThread(ix, methIter, bsCall, lock, remove, dry_run_com, dry_run, dry_run_json, json_commands, conversion, sample_conversion) + thread = MethylationCallThread(ix, methIter, bsCall, lock, remove, dry_run_com, dry_run, dry_run_json, json_commands, conversion, sample_conversion, benchmark_mode) thread.start() thread_list.append(thread) for thread in thread_list: @@ -1256,7 +1301,7 @@ def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=Fal return os.path.abspath(output_dir) -def bsConcat(list_bcfs=None,sample=None,threads=None,bcfSample=None): +def bsConcat(list_bcfs=None,sample=None,threads=None,bcfSample=None,benchmark_mode=False): """ Concatenates all bcf methylation 
calls files in one output file. list_bcfs -- list of bcf files to be concatenated @@ -1277,6 +1322,8 @@ def bsConcat(list_bcfs=None,sample=None,threads=None,bcfSample=None): concat = [executables['bcftools'],'concat','-O','b','-o',bcfSample] if threads != None: concat.extend(['--threads', threads]) + if benchmark_mode: + concat.append('--no-version') concat.extend(list_bcfs) process = run_tools([concat],name="Concatenation Calls",logfile=logfile) diff --git a/gemBS/parser.py b/gemBS/parser.py index 19cec773..96b07c5e 100755 --- a/gemBS/parser.py +++ b/gemBS/parser.py @@ -133,10 +133,10 @@ def read(self, infile): state = 0 known_var = { - 'mapping': ('tmp_dir', 'threads', 'non_stranded', 'reverse_conversion', 'remove_individual_bams', 'underconversion_sequence', 'overconversion_sequence', 'bam_dir', 'sequence_dir'), + 'mapping': ('tmp_dir', 'threads', 'non_stranded', 'reverse_conversion', 'remove_individual_bams', 'underconversion_sequence', 'overconversion_sequence', 'bam_dir', 'sequence_dir', 'benchmark_mode'), 'index': ('index', 'index_dir', 'reference', 'extra_references', 'reference_basename', 'nonbs_index', 'contig_sizes', 'threads', 'dbsnp_files', 'dbsnp_index', 'sampling_rate'), 'calling': ('bcf_dir', 'mapq_threshold', 'qual_threshold', 'left_trim', 'right_trim', 'threads', 'jobs', 'species', 'keep_duplicates', 'keep_improper_pairs', - 'remove_individual_bcfs', 'haploid', 'reference_bias', 'conversion', 'contig_list', 'contig_pool_limit'), + 'remove_individual_bcfs', 'haploid', 'reference_bias', 'conversion', 'contig_list', 'contig_pool_limit', 'benchmark_mode'), 'extract': ('extract_dir', 'jobs', 'allow_het', 'phred_threshold', 'min_inform', 'strand_specific', 'min_bc', 'make_cpg', 'make_non_cpg', 'make_bedmethyl', 'make_bigwig', 'make_snps', 'snp_list', 'snp_db', 'reference_bias'), 'report': ('project', 'report_dir', 'threads') diff --git a/gemBS/production.py b/gemBS/production.py index 814d55fc..5386d030 100644 --- a/gemBS/production.py +++ 
b/gemBS/production.py @@ -183,7 +183,7 @@ def run(self, args): self.command = 'index' self.log_parameter() - ret = index(fasta_input, index_name, greference, extra_fasta_files=extra_fasta_files, threads=self.threads, sampling_rate=args.sampling_rate, tmpDir=os.path.dirname(index_name)) + ret = index(fasta_input, index_name, extra_fasta_files=extra_fasta_files, threads=self.threads, sampling_rate=args.sampling_rate, tmpDir=os.path.dirname(index_name)) if os.path.exists(csizes): os.remove(csizes) csizes_ok = 0 @@ -279,6 +279,7 @@ def register(self,parser): parser.add_argument('--dry-run', dest="dry_run", action="store_true", help="Output mapping commands without execution") parser.add_argument('--json', dest="dry_run_json",metavar="JSON FILE",help="Output JSON file with details of pending commands") parser.add_argument('--ignore-db', dest="ignore_db", action="store_true",help="Ignore database for --dry-run and --json commands") + parser.add_argument('--benchmark-mode', dest="benchmark_mode", action="store_true",help="Omit dates etc. 
to make file comparison simpler", required=False) def run(self, args): self.all_types = ['PAIRED', 'INTERLEAVED', 'SINGLE', 'BAM', 'SAM', 'STREAM', 'PAIRED_STREAM', 'SINGLE_STREAM', 'COMMAND', 'SINGLE_COMMAND', 'PAIRED_COMMAND'] @@ -337,6 +338,7 @@ def run(self, args): self.merge_threads = self.jsonData.check(section='mapping',key='merge_threads',arg=args.merge_threads,default=self.threads) self.sort_memory = self.jsonData.check(section='mapping',key='sort_memory',arg=args.sort_memory, default='768M') self.reverse_conv = self.jsonData.check(section='mapping',key='reverse_conversion',arg=args.reverse_conv, boolean=True) + self.benchmark_mode = self.jsonData.check(section='mapping',key='benchmark_mode',arg=args.benchmark_mode, boolean=True) self.read_non_stranded = self.jsonData.check(section='mapping',key='non_stranded',arg=args.read_non_stranded, boolean=True) if self.read_non_stranded: self.reverse_conv = False @@ -558,6 +560,7 @@ def do_mapping(self, fli): if args.tmp_dir: com.extend(['-d',args.tmp_dir]) if args.read_non_stranded: com.append('-s') if args.reverse_conv: com.append('-R') + if args.benchmark_mode: com.append('--benchmark-mode') if args.underconversion_sequence: com.extend(['-u',args.underconversion_sequence]) if args.overconversion_sequence: com.extend(['-v',args.overconversion_sequence]) if not bis: com.append('--non-bs') @@ -581,11 +584,12 @@ def do_mapping(self, fli): if not tmp: tmp = os.path.dirname(outfile) - ret = mapping(name=fli,index=self.index,fliInfo=fliInfo,inputFiles=inputFiles,ftype=ftype, + ret = mapping(name=fli,index=self.index,fliInfo=fliInfo,inputFiles=inputFiles,ftype=ftype,filetype=filetype, read_non_stranded=self.read_non_stranded, reverse_conv=self.reverse_conv, outfile=outfile,paired=self.paired,tmpDir=tmp, map_threads=self.map_threads,sort_threads=self.sort_threads,sort_memory=self.sort_memory, - under_conversion=self.underconversion_sequence,over_conversion=self.overconversion_sequence) + 
under_conversion=self.underconversion_sequence,over_conversion=self.overconversion_sequence, + benchmark_mode=self.benchmark_mode) if ret: logging.gemBS.gt("Bisulfite Mapping done. Output File: %s" %(ret)) @@ -647,7 +651,7 @@ def do_merge(self, sample, inputs, fname): desc = "merge {}".format(smp) self.json_commands[desc] = task else: - ret = merging(inputs = inputs, sample = sample, threads = self.merge_threads, outname = outfile) + ret = merging(inputs = inputs, sample = sample, threads = self.merge_threads, outname = outfile, benchmark_mode=self.benchmark_mode) if ret: logging.gemBS.gt("Merging process done for {}. Output files generated: {}".format(sample, ','.join(ret))) @@ -884,6 +888,7 @@ def register(self, parser): parser.add_argument('--json', dest="dry_run_json",metavar="JSON FILE",help="Output JSON file with details of pending commands") parser.add_argument('--ignore-db', dest="ignore_db", action="store_true",help="Ignore database for --dry-run and --json commands") parser.add_argument('--ignore-dep', dest="ignore_dep", action="store_true",help="Ignore dependencies for --dry-run and --json commands") + parser.add_argument('--benchmark-mode', dest="benchmark_mode", action="store_true",help="Omit dates etc. 
to make file comparison simpler", required=False) def run(self,args): self.command = 'call' @@ -912,6 +917,7 @@ def run(self,args): self.keep_unmatched = self.jsonData.check(section='calling',key='keep_improper_pairs',arg=args.keep_unmatched,boolean=True) self.keep_duplicates = self.jsonData.check(section='calling',key='keep_duplicates',arg=args.keep_duplicates,boolean=True) self.ignore_duplicates = self.jsonData.check(section='calling',key='ignore_duplicate_flag',arg=args.keep_duplicates,boolean=True) + self.benchmark_mode = self.jsonData.check(section='calling',key='benchmark_mode',arg=args.benchmark_mode, boolean=True) self.haploid = self.jsonData.check(section='calling',key='haploid',arg=args.haploid,boolean=True) self.species = self.jsonData.check(section='calling',key='species',arg=args.species) self.contig_list = self.jsonData.check(section='calling',key='contig_list',arg=args.contig_list,list_type=True, default = []) @@ -1145,6 +1151,7 @@ def run(self,args): if args.keep_unmatched != None: com2.append('-k') if args.haploid != None: com2.append('--haploid') if args.species != None: com2.append('--species') + if args.benchmark_mode: com.append('--benchmark-mode') if args.ref_bias != None: com2.extend(['-B',args.ref_bias]) dry_run_com = [com, com1, com2] @@ -1162,7 +1169,8 @@ def run(self,args): keep_duplicates=self.keep_duplicates,ignore_duplicates=self.ignore_duplicates, dbSNP_index_file=self.dbSNP_index_file,call_threads=self.call_threads,merge_threads=self.merge_threads,jobs=self.jobs, mapq_threshold=self.mapq_threshold,bq_threshold=self.qual_threshold,dry_run_json=self.dry_run_json, - haploid=self.haploid,conversion=self.conversion,ref_bias=self.ref_bias,sample_conversion=self.sample_conversion) + haploid=self.haploid,conversion=self.conversion,ref_bias=self.ref_bias,sample_conversion=self.sample_conversion, + benchmark_mode=self.benchmark_mode) if ret and not (self.dry_run or self.dry_run_json): if args.concat: diff --git a/tools/gemBS_plugins/output.c 
b/tools/gemBS_plugins/output.c index eea81a65..e8548d5a 100644 --- a/tools/gemBS_plugins/output.c +++ b/tools/gemBS_plugins/output.c @@ -80,7 +80,7 @@ void output_cpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt int cx_sz = tags[FMT_CX].st[idx].ne / ns; if(args->mode == CPGMODE_COMBINED) { calc_cpg_meth(args, ns, cpg, sample_gt[idx], sample_gt[idx ^ 1]); - fprintf(fp,"%s\t%d\t%d\t%.2s", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 2, cx_len >= 5 ? cx + 2 : "."); + fprintf(fp,"%s\t%" PRId64 "\t%" PRId64 "\t%.2s", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 2, cx_len >= 5 ? cx + 2 : "."); char *cx_p = tags[FMT_CX].st[idx].dat_p; int *mq_p1 = tags[FMT_MQ].st[idx].ne == ns ? tags[FMT_MQ].st[idx].dat_p : NULL; int *mq_p2 = tags[FMT_MQ].st[idx^1].ne == ns ? tags[FMT_MQ].st[idx^1].dat_p : NULL; @@ -130,7 +130,7 @@ void output_cpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt } else { for(int pos = 0; pos < 2; pos++) { int *mq_p = tags[FMT_MQ].st[idx ^ pos].ne == ns ? tags[FMT_MQ].st[idx ^ pos].dat_p : NULL; - fprintf(fp,"%s\t%d\t%d\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, cx_len >= 3 + pos ? cx[2 + pos] : '.'); + fprintf(fp,"%s\t%" PRId64 "\t%" PRId64 " \t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, cx_len >= 3 + pos ? cx[2 + pos] : '.'); char *cx_p = tags[FMT_CX].st[idx].dat_p; for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { gt_meth *g = sample_gt[idx ^ pos]+ix; @@ -204,7 +204,7 @@ void output_cpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt int cx_len = bcf_get_info_values(args->hdr, rec, "CX", (void **)&cx, &cx_n, BCF_HT_STR); int cx_sz = tags[FMT_CX].st[idx ^ pos].ne / ns; int *mq_p = tags[FMT_MQ].st[idx ^ pos].ne == ns ? tags[FMT_MQ].st[idx ^ pos].dat_p : NULL; - fprintf(fp,"%s\t%d\t%d\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, cx_len >= 3 + pos ? 
cx[2 + pos] : '.'); + fprintf(fp,"%s\t%"PRId64"\t%"PRId64"\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, cx_len >= 3 + pos ? cx[2 + pos] : '.'); char *cx_p = tags[FMT_CX].st[idx ^ pos].dat_p; for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { gt_meth *g = sample_gt[idx ^ pos] + ix; @@ -293,7 +293,7 @@ void output_nonconsec_noncpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_me int cx_len = bcf_get_info_values(args->hdr, rec, "CX", (void **)&cx, &cx_n, BCF_HT_STR); int cx_sz = tags[FMT_CX].st[idx].ne / ns; int *mq_p = tags[FMT_MQ].st[idx].ne == ns ? tags[FMT_MQ].st[idx].dat_p : NULL; - fprintf(fp,"%s\t%d\t%d\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 1, cx_len >= 3 ? cx[2] : '.'); + fprintf(fp,"%s\t%"PRId64"\t%"PRId64"\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 1, cx_len >= 3 ? cx[2] : '.'); char *cx_p = tags[FMT_CX].st[idx].dat_p; for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { gt_meth *g = sample_gt[idx] + ix; @@ -391,14 +391,14 @@ void output_bedmethyl(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sam if(rec->rid != old_rid) { fprintf(fp, "variableStep chrom=%s\n", args->hdr->id[BCF_DT_CTG][rec->rid].key); } - fprintf(fp, "%u\t%.4g\n", rec->pos + 1, 100.0 * m); + fprintf(fp, "%"PRId64"\t%.4g\n", rec->pos + 1, 100.0 * m); } old_rid = rec->rid; } FILE *fp = args->bedmethylfiles[btype]; if(fp != NULL) { int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. of not being called genotype - fprintf(fp, "%s\t%u\t%u\t\"%s\"\t%d\t%c\t%u\t%u\t%s\t%d\t%d\t%s\t%s\t%d\n", + fprintf(fp, "%s\t%"PRId64"\t%"PRId64"\t\"%s\"\t%d\t%c\t%"PRId64"\t%"PRId64"\t%s\t%d\t%d\t%s\t%s\t%d\n", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 1, args->bedmethyl_desc, cov > 1000 ? 
1000 : cov, strand, rec->pos, rec->pos + 1, rgb_tab[(int)(m * 10.0 + 0.5)], cov, (int)(100.0 * m), rtmp, rtmp + 4, gq); } diff --git a/tools/gemBS_plugins/snpxtr.c b/tools/gemBS_plugins/snpxtr.c index fa4b590a..36281536 100644 --- a/tools/gemBS_plugins/snpxtr.c +++ b/tools/gemBS_plugins/snpxtr.c @@ -582,7 +582,7 @@ bcf1_t *process(bcf1_t *rec) if(args.gt[i]) passed = true; } if(passed) { - fprintf(fp, "%s\t%d\t%s", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + 1, id); + fprintf(fp, "%s\t%" PRId64 "\t%s", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + 1, id); for(int i = 0; i < ns; i++) { const int gt = args.gt[i]; if(gt > 0) fprintf(fp, "\t%s%s", rec->d.allele[(gt >> 4) - 1], rec->d.allele[(gt & 7) - 1]); From 06de68c9317bf42a56be8ed679e2b074188e2ab9 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 11 Dec 2019 12:19:00 +0100 Subject: [PATCH 12/61] Bump version --- README.md | 3 ++- gemBS/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 88c783c0..81a1fe15 100644 --- a/README.md +++ b/README.md @@ -92,7 +92,8 @@ Documentation can be found at ---------- Changelog: ---------- - + 3.4.1 Add benchmark-mode that does not write date or program version numbers into SAM/BAM or VCF/BCF files + Switch to samtools, bcftools and htslib v1.10 3.4.0 Move to new bs_call version (2.1.0) which is more efficient in memory use and can read BAMs and write BCFs natively. 
The new bs_call requires a faidx indexed reference, so gemBS diff --git a/gemBS/version.py b/gemBS/version.py index 2d5cc7fb..48b6e071 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" __VERSION_MINOR = "4" -__VERSION_SUBMINOR = "0" +__VERSION_SUBMINOR = "1" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) From a2d782993c63efe739ce36fa9403f50346022a7e Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sat, 14 Dec 2019 10:04:17 +0100 Subject: [PATCH 13/61] New bs_call version with improved CRAM support --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index eee11f42..67305772 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit eee11f420f664b01c06d232ee40e1d7c019f5916 +Subproject commit 673057723473a3e453f1a2630c08d029a47694d1 From 81fc4ece4009666de776da4320bb2b047682669d Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sat, 14 Dec 2019 10:07:33 +0100 Subject: [PATCH 14/61] CRAM support: Add md5_fasta to calculate md5 tags for reference sequences and (if required) populate reference cache to htslib. Modify readNameClean to inject M5: tages into @SQ header lines --- tools/utils/Makefile | 20 +- tools/utils/gemBS_cat.c | 491 ++------------------------- tools/utils/md5_fasta.c | 299 ++++++++++++++++ tools/utils/readNameClean.c | 161 +++++++-- tools/utils/uthash.h | 1 + tools/utils/utils.c | 448 ++++++++++++++++++++++++ tools/utils/{gemBS_cat.h => utils.h} | 1 + 7 files changed, 921 insertions(+), 500 deletions(-) create mode 100644 tools/utils/md5_fasta.c create mode 120000 tools/utils/uthash.h create mode 100644 tools/utils/utils.c rename tools/utils/{gemBS_cat.h => utils.h} (96%) diff --git a/tools/utils/Makefile b/tools/utils/Makefile index 00015c7f..6698d071 100644 --- a/tools/utils/Makefile +++ b/tools/utils/Makefile @@ -13,10 +13,8 @@ CC=gcc ROOT_PATH=.. 
-TOOLS=gemBS_cat readNameClean - +TOOLS=gemBS_cat readNameClean md5_fasta FOLDER_BIN=../bin -TOOLS_SRC=$(addsuffix .c, $(TOOLS)) TOOLS_BIN=$(addprefix $(FOLDER_BIN)/, $(TOOLS)) LIBS:= -lm @@ -30,5 +28,17 @@ static: $(TOOLS_BIN) debug: TOOLS_FLAGS=-O0 $(GENERAL_FLAGS) $(ARCH_FLAGS) $(DEBUG_FLAGS) debug: $(TOOLS_BIN) -$(TOOLS_BIN): $(TOOLS_SRC) - $(CC) --std=c99 $(TOOLS_FLAGS) -o $@ $(notdir $@).c $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS) +clean: + rm -f *~ *.o *.a + +utils.o: utils.c utils.h + $(CC) $(TOOLS_FLAGS) -c utils.c + +$(FOLDER_BIN)/gemBS_cat: gemBS_cat.c utils.o + $(CC) $(TOOLS_FLAGS) -o $@ gemBS_cat.c utils.o $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS) + +$(FOLDER_BIN)/md5_fasta: md5_fasta.c utils.o + $(CC) $(TOOLS_FLAGS) -o $@ md5_fasta.c utils.o $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS)-lcrypto + +$(FOLDER_BIN)/readNameClean: readNameClean.c utils.o + $(CC) $(TOOLS_FLAGS) -o $@ readNameClean.c utils.o $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS) diff --git a/tools/utils/gemBS_cat.c b/tools/utils/gemBS_cat.c index 57c7e5fa..8455f9b2 100644 --- a/tools/utils/gemBS_cat.c +++ b/tools/utils/gemBS_cat.c @@ -3,479 +3,40 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include - -#define STDIN STDIN_FILENO -#define STDOUT STDOUT_FILENO - -#include "gemBS_cat.h" - -static pthread_mutex_t compress_lock; -static struct compress compress_data; +#include -static void free_compress(void) { - int i, j; +#include "utils.h" - if (compress_data.initialized) { - pthread_mutex_lock(&compress_lock); - if (compress_data.initialized) { - for (i = 0; i < COMPRESS_NONE; i++) { - free(compress_data.compress_suffix[i]); - for (j = 0; j < 2; j++) - if (compress_data.comp_path[i][j]) - free(compress_data.comp_path[i][j]); - } - compress_data.initialized = false; - } - pthread_mutex_unlock(&compress_lock); - } +static void cat_file(FILE 
*fp) { + while(!feof(fp)) { + int c = fgetc(fp); + if(c == EOF) break; + putchar(c); + } } -static void init_compress(void) { - int i, j; - char *pnames[][2] = { - {"bgzip", NULL}, {"gzip", NULL}, {"bzip2", NULL}, {"xz", NULL}, {"compress", NULL}, {NULL, NULL}}; - int compress_type[] = {COMPRESS_GZIP, COMPRESS_GZIP, COMPRESS_BZIP2, COMPRESS_XZ, COMPRESS_COMPRESS, COMPRESS_NONE}; - char *suff[] = {"gz", "bz2", "xz", "Z"}; - char *path; +int main(int argc, char *argv[]) { + int err = 0; - if (!compress_data.initialized) { - pthread_mutex_lock(&compress_lock); - errno = 0; - if (!compress_data.initialized) { - (void)setlocale(LC_ALL, ""); - if (!(path = getenv("PATH"))) - path = DEFAULT_PATH; - for (i = 0; i < COMPRESS_NONE; i++) { - compress_data.compress_suffix[i] = strdup(suff[i]); - compress_data.comp_path[i][0] = compress_data.comp_path[i][0] = NULL; + for(int ix = 1; ix <= argc; ix++) { + if(ix == argc) { + if(argc == 1) { + cat_file(stdin); + } + } else { + bool flag; + FILE *fp = open_readfile(argv[ix], &flag); + if(fp == NULL) { + err = errno; + break; } - int ix = 0; - while(pnames[ix][0] != NULL) { - i = compress_type[ix]; - if(compress_data.comp_path[i][0] == NULL) { - for (j = 0; j < 2; j++) - compress_data.comp_path[i][j] = pnames[ix][j] ? 
find_prog(pnames[ix][j], path) : NULL; - } - ix++; + cat_file(fp); + fclose(fp); + if(flag) { + while(waitpid(-1, NULL, 0) > 0); } - for (i = 0; i < COMPRESS_NONE; i++) if (compress_data.comp_path[i][0] != NULL) break; - compress_data.default_compress = i; - if (atexit(free_compress)) - fprintf(stderr, "Warning: Unable to register exit function free_compress()\n"); - compress_data.initialized = true; } - errno = 0; - pthread_mutex_unlock(&compress_lock); } -} - -struct compress* get_compress_data(void) { - init_compress(); - return &compress_data; -} - -void qstrip(char *s) { - char *p, *p1; - - p = s; - p1 = s - 1; - while (*s) { - if (!isspace((int)*s)) - break; - s++; - } - while (*s) { - if (!isspace((int)*s)) - p1 = p; - *(p++) = *(s++); - } - *(++p1) = '\0'; -} - -tokens *tokenize(char *s, const int ch, tokens *tok) { - int n_toks = 0; - char **p = 0, *p1; - - if (!tok) { - tok = malloc(sizeof(tokens)); - if (tok) { - tok->size = 16; - if (tok) { - if (!(tok->toks = malloc(sizeof(void *) * tok->size))) { - free(tok); - tok = NULL; - } - } - } - } - if (tok != NULL) { - p = tok->toks; - if ((p1 = s)) { - if (!ch) { /* Split on white space */ - for (;;) { - while (*s && isspace((int)*s)) - s++; - if (!*s) - break; - if (n_toks == tok->size) { - tok->size <<= 1; - if (!(p = realloc(p, sizeof(void *) * tok->size))) { - free_tokens(tok); - tok = NULL; - break; - } - tok->toks = p; - } - p[n_toks++] = p1; - while (*s && !isspace((int)*s)) { - *p1++ = *s++; - } - if (*s) - s++; - *p1++ = 0; - } - } else { /* Split on token */ - for (;;) { - if (!*s) - break; - if (n_toks == tok->size) { - tok->size <<= 1; - if (!(p = realloc(p, sizeof(void *) * tok->size))) { - free_tokens(tok); - tok = NULL; - break; - } - tok->toks = p; - } - p[n_toks++] = p1; - while (*s && *s != ch) { - *p1++ = *s++; - } - if (*s) - s++; - *p1++ = 0; - qstrip(p[n_toks - 1]); - } - } - } - } - if (tok != NULL) { - if (n_toks == 1 && !*p[0]) - n_toks--; - tok->n_tok = n_toks; - } - return tok; -} 
- -char *find_prog(const char *prog, const char *path) { - char *p, *p1, *path1, *prog1, name[MAXPATHLEN]; - int sz, sz1, found, i; - struct stat buf; - tokens *tok; - - prog1 = strdup(prog); - found = 0; - tok = tokenize(prog1, ':', 0); - for (i = 0; !found && i < tok->n_tok; i++) { - sz1 = (int)strlen(tok->toks[i]); - if (!(p1 = path1 = strdup(path))) - return 0; - while ((p = strsep(&path1, ":"))) { - if (!*p) { - p = "."; - sz = 1; - } else { - sz = (int)strlen(p); - while (p[sz - 1] == '/') - p[--sz] = 0; - } - assert(sz + sz1 + 1 < MAXPATHLEN); - (void)snprintf(name, MAXPATHLEN, "%s/%s", p, tok->toks[i]); - if (!stat(name, &buf) && S_ISREG(buf.st_mode) && !access(name, X_OK)) { - found = 1; - break; - } - } - (void)free(p1); - } - free(prog1); - if (tok) - free_tokens(tok); - if (found) { - return strdup(name); - } - return 0; -} - -static void ignore_handler(__attribute__((unused)) int i) { /* Do nothing */ -} - -static int _child_open(const int read_flag, const char *fname, - const char *filterprog, const char *arg) { - int ppipe[2] = {-1, -1}, fd = -1, fd1; - struct stat sbuf; - struct sigaction s_action; - int childpid; - - if (read_flag == READ && fname) - if (stat(fname, &sbuf)) - return fd; - if (pipe(ppipe) < 0) { - (void)fprintf(stderr, "_child_open(): Can't open pipe\n"); - return fd; - } - childpid = fork(); - if (childpid < 0) { - (void)fprintf(stderr, "_child_open(): cannot fork\n"); - return fd; - } - if (childpid > 0) { /* Parent process */ - if (read_flag == READ) { - fd = ppipe[READ]; - if (close(ppipe[WRITE]) < 0) { - (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - } else { - fd = ppipe[WRITE]; - if (close(ppipe[READ]) < 0) { - (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - } - } else { /* Child process */ - errno = 0; - if (read_flag == READ) { - dup2(ppipe[WRITE], STDOUT); - if (close(ppipe[READ]) < 0) { - (void)fprintf(stderr, "_child_open(): cannot close 
pipe\n"); - exit(EXIT_FAILURE); - } - if (fname) { - fd1 = open(fname, O_RDONLY, 0666); - if (fd1 < 0) { - (void)fprintf(stderr, "_child_open(): cannot open file %s\n", fname); - exit(EXIT_FAILURE); - } - dup2(fd1, STDIN); - } - } else { - dup2(ppipe[READ], STDIN); - if (close(ppipe[WRITE]) < 0) { - (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - if (fname) { - fd1 = creat(fname, 0666); - if (fd1 < 0) { - (void)fprintf(stderr, "_child_open(): cannot open file %s\n", fname); - exit(EXIT_FAILURE); - } - dup2(fd1, STDOUT); - } - } - memset(&s_action, 0, sizeof(struct sigaction)); - s_action.sa_handler = ignore_handler; - s_action.sa_flags = 0; - (void)sigaction(SIGHUP, &s_action, 0L); - (void)sigaction(SIGINT, &s_action, 0L); - (void)sigaction(SIGQUIT, &s_action, 0L); - (void)sigaction(SIGPIPE, &s_action, 0L); - if (read_flag == READ) - (void)execlp(filterprog, filterprog, arg, (char *)0); - else - (void)execlp(filterprog, filterprog, arg, (char *)0); - (void)fprintf(stderr, "child_open(): cannot exec %s\n", filterprog); - _exit(EXIT_FAILURE); - } - return fd; -} - -int child_open(const int read_flag, const char *fname, const char *filterprog) { - int fd; - - if (read_flag == READ) - fd = _child_open(read_flag, fname, filterprog, "-d"); - else - fd = _child_open(read_flag, fname, filterprog, 0); - return fd; -} - -int child_open_rw(int fd[2], const char *filterprog, char *const argv[]) { - int read_pipe[2] = {-1, -1}, write_pipe[2] = {-1, -1}; - struct sigaction s_action; - int childpid; - - fd[0] = fd[1] = -1; - /* Open up a read pipe (from the filter) and a write pipe (to the filter) */ - if (pipe(read_pipe) < 0 || pipe(write_pipe) < 0) { - (void)fprintf(stderr, "child_open_rw(): Can't open pipe\n"); - return -1; - } - childpid = fork(); - if (childpid < 0) { - (void)fprintf(stderr, "child_open_rw(): cannot fork\n"); - return -1; - } - - if (childpid > 0) { - /* In parent process */ - - /* Close write end of read pipe */ - 
fd[READ] = read_pipe[READ]; - if (close(read_pipe[WRITE]) < 0) { - (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - /* Close read end of write pipe */ - fd[WRITE] = write_pipe[WRITE]; - if (close(write_pipe[READ]) < 0) { - (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - } else { - /* In child process */ - - /* Duplicate STDOUT to write end of read pipe, and close read end */ - dup2(read_pipe[WRITE], STDOUT); - if (close(read_pipe[READ]) < 0) { - (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - /* Duplicate STDIN to read end of write pipe, and close write end */ - dup2(write_pipe[READ], STDIN); - if (close(write_pipe[WRITE]) < 0) { - (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - s_action.sa_handler = ignore_handler; - s_action.sa_flags = 0; - (void)sigaction(SIGHUP, &s_action, 0L); - (void)sigaction(SIGINT, &s_action, 0L); - (void)sigaction(SIGQUIT, &s_action, 0L); - (void)sigaction(SIGPIPE, &s_action, 0L); - (void)execv(filterprog, argv); - (void)fprintf(stderr, "child_open_rw(): cannot exec %s\n", filterprog); - _exit(EXIT_FAILURE); - } - return 0; -} - -FILE *_open_readfile(const char *fname, bool *flag, bool chk_flag) { - int guess = COMPRESS_NONE; - FILE *fptr; - unsigned char buf[6]; - char *filter; - char *prog[] = {"gzip", "bzip2", "zip", "compress"}; - - errno = 0; - *flag = false; - if (fname == NULL) - return stdin; - struct compress *compress = get_compress_data(); - if (!(fptr = fopen(fname, "r"))) { - fprintf(stderr, "File Error: Couldn't open '%s' for reading (%s)\n", fname, - strerror(errno)); - if (chk_flag) - exit(-1); - else - return 0; - } - int i = (int)fread(buf, (size_t)1, (size_t)6, fptr); - if (i == 6) { - if (buf[0] == 0x1f) { - if (buf[1] == 0x9d) - guess = COMPRESS_COMPRESS; /* compress */ - else { - if (buf[1] == 0x8b && buf[2] == 0x08) - guess = COMPRESS_GZIP; /* gzip 
*/ - } - } else { - if (buf[0] == 'B' && buf[1] == 'Z' && buf[2] == 'h' && buf[3] >= '0' && - buf[3] <= '9') - guess = COMPRESS_BZIP2; /* bzip2 */ - else { - if (buf[0] == 0xfd && buf[1] == '7' && buf[2] == 'z' && buf[3] == 'X' && buf[4] == 'Z' && buf[5] == 0) - guess = COMPRESS_XZ; /* xz */ - } - } - } - fclose(fptr); - if (guess < COMPRESS_NONE) { - filter = compress->comp_path[guess][0]; - if (filter) { - *flag = true; - i = _child_open(READ, fname, filter, "-d"); - if (!(fptr = fdopen(i, "r"))) { - fputs("Couldn't fdopen() stream", stderr); - exit(-1); - } - if (errno && errno != ESPIPE) { - fputs("Unknown IO error\n", stderr); - exit(-1); - } - errno = 0; - } else { - fprintf(stderr, "File '%s' appears to have been " - "compressed using %s, which is not in the " - "current $PATH\n", - fname, prog[guess]); - if (chk_flag) - exit(-1); - fptr = 0; - } - } else { - if (!(fptr = fopen(fname, "r"))) { - fprintf(stderr, "File Error Couldn't open '%s' for reading (%s)\n", - fname, strerror(errno)); - if (chk_flag) - exit(-1); - } - } - return fptr; -} - -void cat_file(FILE *fp) { - while(!feof(fp)) { - int c = fgetc(fp); - if(c == EOF) break; - putchar(c); - } -} - -int main(int argc, char *argv[]) { - int err = 0; - - for(int ix = 1; ix <= argc; ix++) { - if(ix == argc) { - if(argc == 1) { - cat_file(stdin); - } - } else { - bool flag; - FILE *fp = open_readfile(argv[ix], &flag); - if(fp == NULL) { - err = errno; - break; - } - cat_file(fp); - fclose(fp); - if(flag) { - while(waitpid(-1, NULL, 0) > 0); - } - } - } - return err; + return err; } diff --git a/tools/utils/md5_fasta.c b/tools/utils/md5_fasta.c new file mode 100644 index 00000000..367a7e1e --- /dev/null +++ b/tools/utils/md5_fasta.c @@ -0,0 +1,299 @@ +/* + * md5_fasta.c + * + * Created on: Dec 12, 2019 + * Author: heath + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +#ifndef PATH_MAX +#define PATH_MAX FILENAME_MAX 
+#endif + +static void usage(FILE * const f) { + fputs("usage:\n md5_fasta [option]... [file]...\n", f); + fputs(" -o|--output \n", f); + fputs(" -s|--stream stream input to stdout\n", f); + fputs(" -p|--populate-cache populate local cache\n", f); + fputs(" -h|--help print this message\n", f); +} + +static char btab[256]; +static char path[PATH_MAX]; + +static int get_cache_fname(char *path, const char * cache, const char * str) { + size_t s = PATH_MAX; + size_t ln; + char *init = path; + char *p; + while((p = strchr(cache, '%'))) { + ln = p - cache; + if(ln >= s) return -1; + while(cache < p) *path++ = *cache++; + s -= ln; + if(*++p == 's') { + ln = strlen(str); + if(ln > s) return -1; + strcpy(path, str); + s -= ln; + path += ln; + str += ln; + p++; + } else if(*p >= '0' && *p <= '9') { + char *p1; + long l = strtol(p, &p1, 10); + ln = strlen(str); + if(l > ln) l = ln; + if(*p1 == 's') { + if(l >= s) return -1; + s -= l; + while(l--) *path++ = *str++; + p = p1 + 1; + } else { + if(s < 3) return -1; + *path++ = '%'; + *path++ = *p++; + } + } else { + if(s < 3) return -1; + *path++ = '%'; + *path++ = *p++; + } + cache = p; + } + ln = strlen(cache); + if(ln >= s) return -1; + s -= ln; + while(ln--) *path++ = *cache++; + bool need_slash = (*str && path > init && path[-1] != '/'); + ln = strlen(str) + need_slash ? 
1 : 0; + if(ln >= s) return -1; + if(need_slash) *path++ = '/'; + strcpy(path, str); + return 0; +} + +static int mk_path(char *newpath) { + static char path[PATH_MAX]; + if(strlen(newpath) >= PATH_MAX) return -1; + struct stat sb; + char *p, *p1 = path; + if(*newpath == '/') *p1++ = *newpath++; + while((p = strchr(newpath, '/'))) { + while(newpath < p) *p1++ = *newpath++; + *p1 = 0; + if(stat(path, &sb) == 0) { + if(!S_ISDIR(sb.st_mode)) return -1; + } else { + if(errno != ENOENT) return -1; + if(mkdir(path, 0777)) return -1; + } + *p1++ = *newpath++; + } + return 0; +} + +static void handle_md5(const char * const ctg, MD5_CTX * const ctx, size_t tlen, + const char * const cache, const char * const ref_buf, size_t ref_len, FILE *fout) { + unsigned char b[16]; + char md5[33]; + char *hex_digits="0123456789abcdef"; + MD5_Final(b, ctx); + const char *p = ctg; + int k = 0; + for(int i = 0; i < 16; i++) { + md5[k++] = hex_digits[b[i] >> 4]; + md5[k++] = hex_digits[b[i] & 0xf]; + } + md5[k] = 0; + while(*p && !isspace(*p)) fputc(*p++, fout); + fprintf(fout,"\tLN:%zu\tM5:%s", tlen, md5); + while(*p) { + while(*p && isspace(*p)) p++; + if(!(strncmp(p, "AS:", 3) && strncmp(p, "SP:", 3))) { + fputc('\t', fout); + while(*p && !isspace(*p)) fputc(*p++, fout); + } else while(*p && !isspace(*p)) p++; + } + fputc('\n', fout); + if(cache && ref_buf) { + if(get_cache_fname(path, cache, md5) == 0) { + struct stat sb; + if(stat(path, &sb) != 0) { + if(errno == ENOENT) { + fprintf(stderr, "Creating cache file %s\n", path); + if(mk_path(path) == 0) { + FILE *fp = fopen(path, "w"); + if(fp) { + size_t s = fwrite(ref_buf, 1, ref_len, fp); + fclose(fp); + if(s != ref_len) { + fprintf(stderr, "Error writing cache file %s: %s\n", path, strerror(errno)); + unlink(path); + } + } else fprintf(stderr, "Could not create cache file %s: %s\n", path, strerror(errno)); + } else { + fprintf(stderr, "Could not create directory path for cache file %s\n", path); + } + } else { + fprintf(stderr, "Could 
not access directory path for cache file %s: %s\n", path, strerror(errno)); + } + } + } else { + fprintf(stderr, "Could not create directory path for cache file %s: path too long\n", path); + } + } +} + +static void process_file(FILE * const fp, FILE * const fout, const bool stream, const char * const cache) { + char *buf = NULL; + char *ref_buf = NULL; + size_t ref_buf_size = 0, ref_len = 0; + if(cache) { + ref_buf_size = 16384; + ref_buf = malloc(ref_buf_size); + if(!ref_buf) { + fprintf(stderr, "md5_fasta:process_file() Out of memory"); + exit(-1); + } + } + size_t buf_size = 0, tlen = 0; + ssize_t l; + char *ctg = NULL; + MD5_CTX ctx; + // Process header lines - no conversion + while(1) { + l = getline(&buf, &buf_size, fp); + if(l < 0) break; + if(stream) fputs(buf, stdout); + if(buf[0] == '>') { + if(ctg) { + handle_md5(ctg, &ctx, tlen, cache, ref_buf, ref_len, fout); + ref_len = 0; + free(ctg); + } + if(!buf[0]) continue; + ctg = strdup(buf + 1); + MD5_Init(&ctx); + tlen = 0; + } else { + // First, strip characters not between 33 and 126, and convert to upper case. 
+ char *p = buf, *p1 = buf; + while(*p) { + char c = btab[(int)*p++]; + if(c) *p1++ = c; + } + size_t len = p1 - buf; + if(len) { + MD5_Update(&ctx, buf, len); + tlen += len; + if(ref_buf) { + if(len + ref_len > ref_buf_size) { + ref_buf_size = (ref_buf_size + len) * 1.5; + ref_buf = realloc(ref_buf, ref_buf_size); + if(!ref_buf) { + fprintf(stderr, "md5_fasta:process_file() Out of memory"); + exit(-1); + } + } + memcpy(ref_buf + ref_len, buf, len); + ref_len += len; + } + } + } + } + if(ctg) { + handle_md5(ctg, &ctx, tlen, cache, ref_buf, ref_len, fout); + free(ctg); + } + if(buf != NULL) free(buf); + if(ref_buf != NULL) free(ref_buf); +} + +static char *get_cache_path(void) { + char *ref_cache = getenv("REF_CACHE"); + if(!ref_cache || *ref_cache == 0) { + // This is the same logic used in htslib/cram/cram_io.c + char *ext = NULL; + char *base = getenv("XDG_CACHE_HOME"); + if(!(base && *base)) { + base = getenv("HOME"); + if(base && *base) ext = "/.cache"; + } + if(!(base && *base)) base = getenv("TMPDIR"); + if(!(base && *base)) base = getenv("TEMP"); + if(!(base && *base)) base = "/tmp"; + if(!ext) ext = ""; + ref_cache = malloc(PATH_MAX); + snprintf(ref_cache, PATH_MAX, "%s%s/hts-ref/%%2s/%%2s/%%s", base, ext); + } + return ref_cache; +} + +int main(int argc, char* argv[]) { + struct option longopts[] = { + {"output", required_argument, 0, 'o'}, + {"stream", no_argument, 0, 's'}, + {"populate-cache", no_argument, 0, 'p'}, + {"help", no_argument, 0, 'h'}, + {"usage", no_argument, 0, 'h'}, + {NULL, 0, 0, 0} + }; + int err = 0; + char *output = NULL; + bool stream = false; + bool populate_cache = false; + for(int i = 33; i < 127; i++) btab[i] = toupper(i); + int c; + while(!err && (c = getopt_long(argc, argv, "o:psh?", longopts, 0)) != -1) { + switch(c) { + case 'o': + output = optarg; + break; + case 's': + stream = true; + break; + case 'p': + populate_cache = true; + break; + case 'h': + case '?': + usage(stdout); + } + } + char *cache = NULL; + 
if(populate_cache) cache = get_cache_path(); + FILE *fout = NULL; + if(output) fout = fopen(output, "w"); + if(fout == NULL) fout = stdout; + for(int ix = optind; ix <= argc; ix++) { + if(ix == argc) { + if(argc == optind) process_file(stdin, fout, stream, cache); + } else { + bool flag; + FILE *fp = open_readfile(argv[ix], &flag); + if(fp == NULL) { + err = errno; + break; + } + process_file(fp, fout, stream, cache); + fclose(fp); + if(flag) while(waitpid(-1, NULL, 0) > 0); + } + } + if(fout != stdout) fclose(fout); + return err; +} diff --git a/tools/utils/readNameClean.c b/tools/utils/readNameClean.c index 0c0b08db..29d0817f 100644 --- a/tools/utils/readNameClean.c +++ b/tools/utils/readNameClean.c @@ -2,37 +2,138 @@ #include #include #include +#include +#include -// Strip illegal characters from Read Ids in SAMFILE +#include "utils.h" +#include "uthash.h" + +// Strip illegal characters from Read IDs in SAM file // Valid characters are [!-?A-~] -int main(void) { - FILE *fp = stdin; - char *buf = NULL; - size_t buf_size = 0; - ssize_t l; - // Process header lines - no conversion - while(1) { - l = getline(&buf, &buf_size, fp); - if(l < 0) return 0; - if(buf[0] != '@') break; - fputs(buf, stdout); - } - // Process the rest of the file - while(l >= 0) { - int i; - bool found = false; - for(i = 0; i < l && buf[i] != '\t'; i++) if((found = (buf[i] == '@' || buf[i] < '!' || buf[i] > '~'))) break; - if(found) { - int j = i; - for(i = i + 1; i < l && buf[i] != '\t'; i++) { - if(buf[i] != '@' && buf[i] >= '!' 
&& buf[i] <= '~') buf[j++] = buf[i]; - } - for(; i <= l; i++) buf[j++] = buf[i]; - } - fputs(buf, stdout); - l = getline(&buf, &buf_size, fp); - } - if(buf) free(buf); - return 0; +// Option to edit SAM headers, adding extra information to the @SQ lines + +#define NUM_SQTAGS 4 +static char *sqtags[NUM_SQTAGS] = { + "LN", "M5", "AS", "SP" +}; + +typedef struct { + char *name; + char *tags[NUM_SQTAGS]; + UT_hash_handle hh; +} ctg_t; + +ctg_t *process_ctg_file(char *name) { + ctg_t *ctgs = NULL; + bool flag; + FILE *fp = open_readfile(name, &flag); + if(fp == NULL) { + fprintf(stderr, "Could not open %s for reading\n", name); + exit(-1); + } + char *buf = NULL; + size_t buf_size = 0, tlen = 0; + ssize_t l; + tokens *tok = NULL; + while(1) { + l = getline(&buf, &buf_size, fp); + if(l < 0) break; + tok = tokenize(buf, '\t', tok); + if(tok->n_tok > 1) { + ctg_t *ct = NULL; + HASH_FIND_STR(ctgs, tok->toks[0], ct); + if(ct != NULL) { + fprintf(stderr, "process_ctg_file(): error - duplicate contig %s\n", tok->toks[0]); + exit(-1); + } + ct = malloc(sizeof(ctg_t)); + ct->name = strdup(tok->toks[0]); + for(int i = 0; i < NUM_SQTAGS; i++) ct->tags[i] = NULL; + for(int i = 1; i < tok->n_tok; i++) { + const char * const p = tok->toks[i]; + for(int j = 0; j < NUM_SQTAGS; j++) { + if(!strncmp(p, sqtags[j], 2) && p[2] == ':') { + ct->tags[j] = strdup(p + 3); + break; + } + } + } + HASH_ADD_KEYPTR(hh, ctgs, ct->name, strlen(ct->name), ct); + } + } + fclose(fp); + if(flag) while(waitpid(-1, NULL, 0) > 0); + if(tok != NULL) free_tokens(tok); + if(buf != NULL) free(buf); + return ctgs; +} + +int main(int argc, char *argv[]) { + FILE *fp = stdin; + char *buf = NULL; + size_t buf_size = 0; + ssize_t l; + ctg_t *ctgs = NULL; + + if(argc > 1) ctgs = process_ctg_file(argv[1]); + // Process header lines - no conversion + while(1) { + l = getline(&buf, &buf_size, fp); + if(l < 0) return 0; + if(buf[0] != '@') break; + bool pflag = true; + if(l > 8 && !strncmp(buf + 1, "SQ\tSN:", 6)) { + char 
*p = buf + 7; + char *p1 = p; + while(*p1 && *p1 != '\t' && *p1 != '\n') p1++; + size_t l = p1 - p; + ctg_t *ct = NULL; + HASH_FIND(hh, ctgs, p, l, ct); + if(ct) { + pflag = false; + int mask = 0; + char c = *p1; + *p1 = 0; + fputs(buf, stdout); + while(c == '\t') { + p1++; + p = p1; + for(int j = 0; j < NUM_SQTAGS; j++) { + if(!strncmp(p1, sqtags[j], 2) && p1[2] == ':') { + mask |= (1 << j); + break; + } + } + while(*p1 && *p1 != '\t' && *p1 != '\n') p1++; + c = *p1; + *p1 = 0; + printf("\t%s", p); + } + int j = 0; + for(int j = 0; j < NUM_SQTAGS; j++) { + if(ct->tags[j] != NULL && !(mask & (1 << j))) printf("\t%s:%s", sqtags[j], ct->tags[j]); + } + fputc('\n', stdout); + } + } + if(pflag) fputs(buf, stdout); + } + // Process the rest of the file + while(l >= 0) { + int i; + bool found = false; + for(i = 0; i < l && buf[i] != '\t'; i++) if((found = (buf[i] == '@' || buf[i] < '!' || buf[i] > '~'))) break; + if(found) { + int j = i; + for(i = i + 1; i < l && buf[i] != '\t'; i++) { + if(buf[i] != '@' && buf[i] >= '!' 
&& buf[i] <= '~') buf[j++] = buf[i]; + } + for(; i <= l; i++) buf[j++] = buf[i]; + } + fputs(buf, stdout); + l = getline(&buf, &buf_size, fp); + } + if(buf) free(buf); + return 0; } diff --git a/tools/utils/uthash.h b/tools/utils/uthash.h new file mode 120000 index 00000000..4aedc371 --- /dev/null +++ b/tools/utils/uthash.h @@ -0,0 +1 @@ +../bs_call/include/uthash.h \ No newline at end of file diff --git a/tools/utils/utils.c b/tools/utils/utils.c new file mode 100644 index 00000000..e3304be3 --- /dev/null +++ b/tools/utils/utils.c @@ -0,0 +1,448 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define STDIN STDIN_FILENO +#define STDOUT STDOUT_FILENO + +#include "utils.h" + +static pthread_mutex_t compress_lock; +static struct compress compress_data; + +static void free_compress(void) { + int i, j; + + if (compress_data.initialized) { + pthread_mutex_lock(&compress_lock); + if (compress_data.initialized) { + for (i = 0; i < COMPRESS_NONE; i++) { + free(compress_data.compress_suffix[i]); + for (j = 0; j < 2; j++) + if (compress_data.comp_path[i][j]) + free(compress_data.comp_path[i][j]); + } + compress_data.initialized = false; + } + pthread_mutex_unlock(&compress_lock); + } +} + +static void init_compress(void) { + int i, j; + char *pnames[][2] = { + {"bgzip", NULL}, {"gzip", NULL}, {"bzip2", NULL}, {"xz", NULL}, {"compress", NULL}, {NULL, NULL}}; + int compress_type[] = {COMPRESS_GZIP, COMPRESS_GZIP, COMPRESS_BZIP2, COMPRESS_XZ, COMPRESS_COMPRESS, COMPRESS_NONE}; + char *suff[] = {"gz", "bz2", "xz", "Z"}; + char *path; + + if (!compress_data.initialized) { + pthread_mutex_lock(&compress_lock); + errno = 0; + if (!compress_data.initialized) { + (void)setlocale(LC_ALL, ""); + if (!(path = getenv("PATH"))) + path = DEFAULT_PATH; + for (i = 0; i < COMPRESS_NONE; i++) { + compress_data.compress_suffix[i] = strdup(suff[i]); + 
compress_data.comp_path[i][0] = compress_data.comp_path[i][1] = NULL; + } + int ix = 0; + while(pnames[ix][0] != NULL) { + i = compress_type[ix]; + if(compress_data.comp_path[i][0] == NULL) { + for (j = 0; j < 2; j++) + compress_data.comp_path[i][j] = pnames[ix][j] ? find_prog(pnames[ix][j], path) : NULL; + } + ix++; + } + for (i = 0; i < COMPRESS_NONE; i++) if (compress_data.comp_path[i][0] != NULL) break; + compress_data.default_compress = i; + if (atexit(free_compress)) + fprintf(stderr, "Warning: Unable to register exit function free_compress()\n"); + compress_data.initialized = true; + } + errno = 0; + pthread_mutex_unlock(&compress_lock); + } +} + +struct compress* get_compress_data(void) { + init_compress(); + return &compress_data; +} + +void qstrip(char *s) { + char *p, *p1; + + p = s; + p1 = s - 1; + while (*s) { + if (!isspace((int)*s)) + break; + s++; + } + while (*s) { + if (!isspace((int)*s)) + p1 = p; + *(p++) = *(s++); + } + *(++p1) = '\0'; +} + +tokens *tokenize(char *s, const int ch, tokens *tok) { + int n_toks = 0; + char **p = 0, *p1; + + if (!tok) { + tok = malloc(sizeof(tokens)); + if (tok) { + tok->size = 16; + if (tok) { + if (!(tok->toks = malloc(sizeof(void *) * tok->size))) { + free(tok); + tok = NULL; + } + } + } + } + if (tok != NULL) { + p = tok->toks; + if ((p1 = s)) { + if (!ch) { /* Split on white space */ + for (;;) { + while (*s && isspace((int)*s)) + s++; + if (!*s) + break; + if (n_toks == tok->size) { + tok->size <<= 1; + if (!(p = realloc(p, sizeof(void *) * tok->size))) { + free_tokens(tok); + tok = NULL; + break; + } + tok->toks = p; + } + p[n_toks++] = p1; + while (*s && !isspace((int)*s)) { + *p1++ = *s++; + } + if (*s) + s++; + *p1++ = 0; + } + } else { /* Split on token */ + for (;;) { + if (!*s) + break; + if (n_toks == tok->size) { + tok->size <<= 1; + if (!(p = realloc(p, sizeof(void *) * tok->size))) { + free_tokens(tok); + tok = NULL; + break; + } + tok->toks = p; + } + p[n_toks++] = p1; + while (*s && *s != ch) { + 
*p1++ = *s++; + } + if (*s) + s++; + *p1++ = 0; + qstrip(p[n_toks - 1]); + } + } + } + } + if (tok != NULL) { + if (n_toks == 1 && !*p[0]) + n_toks--; + tok->n_tok = n_toks; + } + return tok; +} + +char *find_prog(const char *prog, const char *path) { + char *p, *p1, *path1, *prog1, name[MAXPATHLEN]; + int sz, sz1, found, i; + struct stat buf; + tokens *tok; + + prog1 = strdup(prog); + found = 0; + tok = tokenize(prog1, ':', 0); + for (i = 0; !found && i < tok->n_tok; i++) { + sz1 = (int)strlen(tok->toks[i]); + if (!(p1 = path1 = strdup(path))) + return 0; + while ((p = strsep(&path1, ":"))) { + if (!*p) { + p = "."; + sz = 1; + } else { + sz = (int)strlen(p); + while (p[sz - 1] == '/') + p[--sz] = 0; + } + assert(sz + sz1 + 1 < MAXPATHLEN); + (void)snprintf(name, MAXPATHLEN, "%s/%s", p, tok->toks[i]); + if (!stat(name, &buf) && S_ISREG(buf.st_mode) && !access(name, X_OK)) { + found = 1; + break; + } + } + (void)free(p1); + } + free(prog1); + if (tok) + free_tokens(tok); + if (found) { + return strdup(name); + } + return 0; +} + +static void ignore_handler(__attribute__((unused)) int i) { /* Do nothing */ +} + +static int _child_open(const int read_flag, const char *fname, + const char *filterprog, const char *arg) { + int ppipe[2] = {-1, -1}, fd = -1, fd1; + struct stat sbuf; + struct sigaction s_action; + int childpid; + + if (read_flag == READ && fname) + if (stat(fname, &sbuf)) + return fd; + if (pipe(ppipe) < 0) { + (void)fprintf(stderr, "_child_open(): Can't open pipe\n"); + return fd; + } + childpid = fork(); + if (childpid < 0) { + (void)fprintf(stderr, "_child_open(): cannot fork\n"); + return fd; + } + if (childpid > 0) { /* Parent process */ + if (read_flag == READ) { + fd = ppipe[READ]; + if (close(ppipe[WRITE]) < 0) { + (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); + exit(EXIT_FAILURE); + } + } else { + fd = ppipe[WRITE]; + if (close(ppipe[READ]) < 0) { + (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); + exit(EXIT_FAILURE); 
+ } + } + } else { /* Child process */ + errno = 0; + if (read_flag == READ) { + dup2(ppipe[WRITE], STDOUT); + if (close(ppipe[READ]) < 0) { + (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); + exit(EXIT_FAILURE); + } + if (fname) { + fd1 = open(fname, O_RDONLY, 0666); + if (fd1 < 0) { + (void)fprintf(stderr, "_child_open(): cannot open file %s\n", fname); + exit(EXIT_FAILURE); + } + dup2(fd1, STDIN); + } + } else { + dup2(ppipe[READ], STDIN); + if (close(ppipe[WRITE]) < 0) { + (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); + exit(EXIT_FAILURE); + } + if (fname) { + fd1 = creat(fname, 0666); + if (fd1 < 0) { + (void)fprintf(stderr, "_child_open(): cannot open file %s\n", fname); + exit(EXIT_FAILURE); + } + dup2(fd1, STDOUT); + } + } + memset(&s_action, 0, sizeof(struct sigaction)); + s_action.sa_handler = ignore_handler; + s_action.sa_flags = 0; + (void)sigaction(SIGHUP, &s_action, 0L); + (void)sigaction(SIGINT, &s_action, 0L); + (void)sigaction(SIGQUIT, &s_action, 0L); + (void)sigaction(SIGPIPE, &s_action, 0L); + if (read_flag == READ) + (void)execlp(filterprog, filterprog, arg, (char *)0); + else + (void)execlp(filterprog, filterprog, arg, (char *)0); + (void)fprintf(stderr, "child_open(): cannot exec %s\n", filterprog); + _exit(EXIT_FAILURE); + } + return fd; +} + +int child_open(const int read_flag, const char *fname, const char *filterprog) { + int fd; + + if (read_flag == READ) + fd = _child_open(read_flag, fname, filterprog, "-d"); + else + fd = _child_open(read_flag, fname, filterprog, 0); + return fd; +} + +int child_open_rw(int fd[2], const char *filterprog, char *const argv[]) { + int read_pipe[2] = {-1, -1}, write_pipe[2] = {-1, -1}; + struct sigaction s_action; + int childpid; + + fd[0] = fd[1] = -1; + /* Open up a read pipe (from the filter) and a write pipe (to the filter) */ + if (pipe(read_pipe) < 0 || pipe(write_pipe) < 0) { + (void)fprintf(stderr, "child_open_rw(): Can't open pipe\n"); + return -1; + } + childpid = 
fork(); + if (childpid < 0) { + (void)fprintf(stderr, "child_open_rw(): cannot fork\n"); + return -1; + } + + if (childpid > 0) { + /* In parent process */ + + /* Close write end of read pipe */ + fd[READ] = read_pipe[READ]; + if (close(read_pipe[WRITE]) < 0) { + (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); + exit(EXIT_FAILURE); + } + /* Close read end of write pipe */ + fd[WRITE] = write_pipe[WRITE]; + if (close(write_pipe[READ]) < 0) { + (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); + exit(EXIT_FAILURE); + } + } else { + /* In child process */ + + /* Duplicate STDOUT to write end of read pipe, and close read end */ + dup2(read_pipe[WRITE], STDOUT); + if (close(read_pipe[READ]) < 0) { + (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); + exit(EXIT_FAILURE); + } + /* Duplicate STDIN to read end of write pipe, and close write end */ + dup2(write_pipe[READ], STDIN); + if (close(write_pipe[WRITE]) < 0) { + (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); + exit(EXIT_FAILURE); + } + s_action.sa_handler = ignore_handler; + s_action.sa_flags = 0; + (void)sigaction(SIGHUP, &s_action, 0L); + (void)sigaction(SIGINT, &s_action, 0L); + (void)sigaction(SIGQUIT, &s_action, 0L); + (void)sigaction(SIGPIPE, &s_action, 0L); + (void)execv(filterprog, argv); + (void)fprintf(stderr, "child_open_rw(): cannot exec %s\n", filterprog); + _exit(EXIT_FAILURE); + } + return 0; +} + +FILE *_open_readfile(const char *fname, bool *flag, bool chk_flag) { + int guess = COMPRESS_NONE; + FILE *fptr; + unsigned char buf[6]; + char *filter; + char *prog[] = {"gzip", "bzip2", "zip", "compress"}; + + errno = 0; + *flag = false; + if (fname == NULL) + return stdin; + struct compress *compress = get_compress_data(); + if (!(fptr = fopen(fname, "r"))) { + fprintf(stderr, "File Error: Couldn't open '%s' for reading (%s)\n", fname, + strerror(errno)); + if (chk_flag) + exit(-1); + else + return 0; + } + int i = (int)fread(buf, (size_t)1, 
(size_t)6, fptr); + if (i == 6) { + if (buf[0] == 0x1f) { + if (buf[1] == 0x9d) + guess = COMPRESS_COMPRESS; /* compress */ + else { + if (buf[1] == 0x8b && buf[2] == 0x08) + guess = COMPRESS_GZIP; /* gzip */ + } + } else { + if (buf[0] == 'B' && buf[1] == 'Z' && buf[2] == 'h' && buf[3] >= '0' && + buf[3] <= '9') + guess = COMPRESS_BZIP2; /* bzip2 */ + else { + if (buf[0] == 0xfd && buf[1] == '7' && buf[2] == 'z' && buf[3] == 'X' && buf[4] == 'Z' && buf[5] == 0) + guess = COMPRESS_XZ; /* xz */ + } + } + } + fclose(fptr); + if (guess < COMPRESS_NONE) { + filter = compress->comp_path[guess][0]; + if (filter) { + *flag = true; + i = _child_open(READ, fname, filter, "-d"); + if (!(fptr = fdopen(i, "r"))) { + fputs("Couldn't fdopen() stream", stderr); + exit(-1); + } + if (errno && errno != ESPIPE) { + fputs("Unknown IO error\n", stderr); + exit(-1); + } + errno = 0; + } else { + fprintf(stderr, "File '%s' appears to have been " + "compressed using %s, which is not in the " + "current $PATH\n", + fname, prog[guess]); + if (chk_flag) + exit(-1); + fptr = 0; + } + } else { + if (!(fptr = fopen(fname, "r"))) { + fprintf(stderr, "File Error Couldn't open '%s' for reading (%s)\n", + fname, strerror(errno)); + if (chk_flag) + exit(-1); + } + } + return fptr; +} diff --git a/tools/utils/gemBS_cat.h b/tools/utils/utils.h similarity index 96% rename from tools/utils/gemBS_cat.h rename to tools/utils/utils.h index 324e2a2a..06d8b8f0 100644 --- a/tools/utils/gemBS_cat.h +++ b/tools/utils/utils.h @@ -40,6 +40,7 @@ typedef struct { free(x); \ } +tokens *tokenize(char *s, const int ch, tokens *tok); char *find_prog(const char *prog, const char *path); FILE *_open_readfile(const char *fname, bool *flag, bool chk_flag); int child_open_rw(int fd[2],const char *filterprog,char *const argv[]); From 4e10ee18edbe38fdc5f6177d2b2a93887f3395cf Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sat, 14 Dec 2019 10:08:35 +0100 Subject: [PATCH 15/61] CRAM support: configure with lzma --- 
tools/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index 575863c9..fbb26b71 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -83,13 +83,13 @@ gem3-mapper/Makefile.mk: gem3-mapper/Makefile.mk.in gem3-mapper/configure cd gem3-mapper; ./configure ${GEM3_CONFIG} $(SAMTOOLS_DIR)/config.mk: - cd $(SAMTOOLS_DIR); ./configure --disable-lzma + cd $(SAMTOOLS_DIR); ./configure # --disable-lzma $(FOLDER_BIN)/bcftools: $(BCFTOOLS_DIR) $(BCFTOOLS_DIR)/plugins/mextr.c $(BCFTOOLS_DIR)/plugins/snpxtr.c $(MAKE) --directory=$(BCFTOOLS_DIR) all $(BCFTOOLS_DIR)/config.h: - cd $(BCFTOOLS_DIR); ./configure --disable-lzma + cd $(BCFTOOLS_DIR); ./configure # --disable-lzma touch $(BCFTOOLS_DIR)/config.h $(SAMTOOLS_DIR): From e008e1a938eaa8144cd776e3888a0a7ee5784bb3 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sat, 14 Dec 2019 10:10:07 +0100 Subject: [PATCH 16/61] Add CRAM support --- README.md | 1 + gemBS/__init__.py | 113 +++++++++++++++++++++++++------------------- gemBS/database.py | 19 +++++++- gemBS/parser.py | 4 +- gemBS/production.py | 52 +++++++++++++++----- gemBS/version.py | 2 +- setup.py | 2 +- 7 files changed, 129 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index 81a1fe15..9ee42ca4 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,7 @@ Documentation can be found at ---------- Changelog: ---------- + 3.4.2 Add CRAM support (via make_cram option in configuration file) 3.4.1 Add benchmark-mode that does not write date or program version numbers into SAM/BAM or VCF/BCF files Switch to samtools, bcftools and htslib v1.10 3.4.0 Move to new bs_call version (2.1.0) which is more efficient diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 1f572bc4..c197bffa 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -92,6 +92,7 @@ def __getitem__(self, item): "bedToBigBed": "bedToBigBed", "dbSNP_idx": "dbSNP_idx", "gemBS_cat": "gemBS_cat", + "md5_fasta": "md5_fasta", "samtools": 
"samtools", "bcftools": "bcftools", "bgzip": "bgzip", @@ -512,12 +513,11 @@ def file_bgzipped(file_name): ret = (len(st) == 16 and st[0:4] == a and st[10:16] == b) return(ret) -def mk_gembs_reference(input_name, greference, extra_fasta_files=None, threads=None): +def mk_gembs_reference(input_name, greference, contig_md5, extra_fasta_files=None, threads=None, populate_cache=False): """Create bgzipped copy of reference file(s) in the same directory where - the index(es) are stored. If the supplied reference is already bgzipped then we - simply make a symbolic link to the existing reference. - This file will serve as the reference for the bs_call command, and for this - purpose fai and gzi indexes of the reference will be created. + the index(es) are stored. This file will serve as the reference for the + bs_call command, and for this purpose fai and gzi indexes of the reference will be created. + The contig_md5 files will be created at the same time. """ output_dir, base = os.path.split(greference) @@ -525,21 +525,41 @@ def mk_gembs_reference(input_name, greference, extra_fasta_files=None, threads=N os.makedirs(output_dir) if not os.path.exists(greference): - if file_bgzipped(input_name): + md5_fasta = [executables['md5_fasta'], '-o', contig_md5] + if populate_cache: + md5_fasta.append('-p') + if extra_fasta_files == None and file_bgzipped(input_name): os.symlink(os.path.abspath(input_name), greference) - else : - gcat = [executables['gemBS_cat'],input_name] + mk_ref = False + else: + md5_fasta.append('-s') + mk_ref = True + + md5_fasta.append(input_name) + if extra_fasta_files != None: + for f in extra_fasta_files: + if not os.path.exists(f): + raise CommandException("Reference file '{}' does not exist".format(f)) + md5_fasta.extend(extra_fasta_files) + if mk_ref: bgzip_bin = executables['bgzip'] if bgzip_bin == None: raise CommandException("bgzip binary not found (should be bundled with the gemBS distribution)\n"); bgzip_command = [bgzip_bin] if threads != None: 
bgzip_command.extend(['-@', str(threads)]); - process = run_tools([gcat,bgzip_command], name='gemBS_cat', output = greference) + process = run_tools([md5_fasta,bgzip_command], name='md5_fasta', output = greference) if process.wait() != 0: - if os.path.exists(greference): - os.remove(greference) + for f in [greference, md5_contig]: + if os.path.exists(f): + os.remove(f) raise ValueError("Error while making gemBS reference") + else: + process = run_tools([md5_fasta], name='md5_fasta', output = None) + if process.wait() != 0: + if os.path.exists(md5_contig): + os.remove(md5_contig) + raise ValueError("Error while making gemBS reference") process = run_tools([[executables['samtools'],'faidx',greference]], name='samtools faidx', output = 'greference.fai') if process.wait() != 0: @@ -547,10 +567,24 @@ def mk_gembs_reference(input_name, greference, extra_fasta_files=None, threads=N if os.path.exists(f): os.remove(f) raise ValueError("Error while making faidx index of gemBS reference") + +def mk_contig_md5(contig_md5, greference, populate_cache): + output_dir, base = os.path.split(contig_md5) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + md5 = [executables['md5_fasta'], '-o', contig_md5] + if populate_cache: + md5.append('-p') + md5.append(greference) + process = run_tools([md5], name='md5_fasta', output = None) + if process.wait() != 0: + if os.path.exists(contig_md5): + os.remove(contig_md5) + raise ValueError("Error while making contig md5 file") + return os.path.abspath(contig_md5) -def index(input_name, index_name, extra_fasta_files=None,threads=None,tmpDir=None,sampling_rate=None,nonbs_flag=False): - """Run the gem-indexer on the given input. Input has to be the path - to a single fasta file that contains the genome to be indexed. +def index(index_name, greference, threads=None,tmpDir=None,sampling_rate=None,nonbs_flag=False): + """Run the gem-indexer on the given gem reference. Output should be the path to the target index file. 
Note that the gem index has to end in .BS.gem and the prefix is added if necessary and the returned path will always be the correct path to the index. @@ -565,29 +599,12 @@ def index(input_name, index_name, extra_fasta_files=None,threads=None,tmpDir=Non output_dir, base = os.path.split(index_name) if not os.path.exists(output_dir): os.makedirs(output_dir) - if extra_fasta_files: - gcat = [executables['gemBS_cat'],input_name] - for f in extra_fasta_files: - if not os.path.exists(f): - raise CommandException("Reference file '{}' does not exist".format(f)) - - gcat.extend(extra_fasta_files) - output = index_base + "_gemBS.tmp.gz" - process = run_tools([gcat,['pigz']], name='gemBS_cat', output = output) - if process.wait() != 0: - if os.path.exists(output): - os.remove(output) - raise ValueError("Error while concatenating input fasta files") - f_in = output - else: - f_in = input_name - logfile = os.path.join(output_dir,"gem_indexer_" + base + ".err") logging.gemBS.gt("Creating index") indexer = [ executables['gem-indexer'], - '-i',f_in, + '-i',greference, '-o',index_base ] @@ -608,14 +625,11 @@ def index(input_name, index_name, extra_fasta_files=None,threads=None,tmpDir=Non process = run_tools([indexer], name="gem-indexer", logfile=logfile) if process.wait() != 0: - for f in (f_in, index_base + '.gem', index_base + '.info', index_base + '.sa.tmp'): - if os.path.exists(f) and f != input_name: + for f in (index_base + '.gem', index_base + '.info', index_base + '.sa.tmp'): + if os.path.exists(f): os.remove(f) raise ValueError("Error while executing the Bisulphite gem-indexer") - if f_in != input_name: - os.remove(f_in) - if index_name != index_base + ".gem": os.rename(index_base + ".gem", index_name) os.rename(index_base + ".info", index_name + ".info") @@ -687,7 +701,8 @@ def makeChromSizes(index_name=None,output=None): def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None,filetype=None, read_non_stranded=False,reverse_conv=False,outfile=None, 
paired=False,tmpDir="/tmp",map_threads=None,sort_threads=None, - sort_memory=None,under_conversion=None, over_conversion=None, benchmark_mode=False): + sort_memory=None,under_conversion=None, over_conversion=None, + benchmark_mode=False, contig_md5=None, greference=None): """ Start the GEM Bisulfite mapping on the given input. name -- Name basic (FLI) for the input and output fastq files @@ -707,6 +722,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None,filetyp under_conversion -- Under conversion sequence over_conversion -- Over conversion sequence benchmark_mode -- Remove times etc. from output files to simplify file comparisons + contig_md5 -- File with md5 sums for all contigs """ ## prepare the input input_pipe = [] @@ -765,7 +781,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None,filetyp if over_conversion != "" and over_conversion != None: mapping.extend(["--overconversion-sequence",over_conversion]) #READ FILTERING - readNameClean = [executables['readNameClean']] + readNameClean = [executables['readNameClean'], contig_md5] #BAM SORT bamSort = [executables['samtools'],"sort","-T",os.path.join(tmpDir,name),"-@",sort_threads,"-m",sort_memory,"-o",outfile] @@ -773,7 +789,9 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None,filetyp bamSort.append("--write-index") if benchmark_mode: bamSort.append("--no-PG") - bamSort.append("-"); + if outfile.endswith('.cram'): + bamSort.extend(['-O', 'CRAM', '--reference', greference ]); + bamSort.append('-'); tools = [mapping,readNameClean,bamSort] @@ -784,7 +802,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None,filetyp return os.path.abspath("%s" % outfile) -def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benchmark_mode=False): +def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benchmark_mode=False, greference=None): """ Merge bam alignment files inputs -- Dictionary 
of samples and bam list files inputs(Key=sample, Value = [bam1,...,bamN]) @@ -798,7 +816,10 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benc output = os.path.dirname(outname) bam_filename = outname - index_filename = outname[:-3] + 'bai' + if outname.endswith('.cram'): + index_filename = outname[:-4] + 'crai' + else: + index_filename = outname[:-3] + 'csi' md5_filename = outname + '.md5' bammerging = [] @@ -811,6 +832,8 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benc bammerging.extend([executables['samtools'],"merge","--threads",threads,"--write-index"]) if benchmark_mode: bammerging.append("--no-PG") + if bam_filename.endswith('.cram'): + bammerging.extend(['-O', 'CRAM', '--reference', greference]); bammerging.extend(["-f",bam_filename]) for bamFile in inputs: bammerging.append(bamFile) @@ -819,14 +842,8 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benc if process.wait() != 0: raise ValueError("Error while merging.") return_info.append(os.path.abspath(bam_filename)) - #Samtools index -# logfile = os.path.join(output,"bam_index_{}.err".format(sample)) -# indexing = [executables['samtools'], "index", "-@", threads, bam_filename, index_filename] md5sum = ['md5sum',bam_filename] -# processIndex = run_tools([indexing],name="Indexing",logfile=logfile) processMD5 = run_tools([md5sum],name="BAM MD5",output=md5_filename) -# if processIndex.wait() != 0: -# raise ValueError("Error while indexing BAM file.") if processMD5.wait() != 0: raise ValueError("Error while calculating md5sum of BAM file.") diff --git a/gemBS/database.py b/gemBS/database.py index 62f285b5..3506f2d6 100644 --- a/gemBS/database.py +++ b/gemBS/database.py @@ -3,6 +3,7 @@ import re import fnmatch import logging +import json import threading as th from .utils import CommandException @@ -162,13 +163,17 @@ def check_index(self): csizes = index + '.contig.sizes' if index == None: greference = 
os.path.join(index_dir, reference_basename) + '.gemBS.ref' + contig_md5 = os.path.join(index_dir, reference_basename) + '.gemBS.contig_md5' else: if index.endswith('.BS.gem'): greference = index[:-6] + 'gemBS.ref' + contig_md5 = index[:-6] + 'gemBS.contig_md5' elif index.endswith('.gem'): greference = index[:-3] + 'gemBS.ref' + contig_md5 = index[:-3] + 'gemBS.contig_md5' else: greference = index + '.gemBS.ref' + contig_md5 = index + '.gemBS.contig_md5' if index == None: index = os.path.join(index_dir, reference_basename) + '.BS.gem' index_ok = 1 if os.path.exists(index) else 0 @@ -190,9 +195,11 @@ def check_index(self): nonbs_index_ok = 0 csizes_ok = 1 if os.path.exists(csizes) else 0 greference_ok = 1 if os.path.exists(greference) and os.path.exists(greference + '.fai') and os.path.exists(greference + '.gzi') else 0 + contig_md5_ok = 1 if os.path.exists(contig_md5) else 0 c.execute("REPLACE INTO indexing VALUES (?, 'index', ?)",(index, index_ok)) c.execute("REPLACE INTO indexing VALUES (?, 'contig_sizes', ?)",(csizes,csizes_ok)) c.execute("REPLACE INTO indexing VALUES (?, 'gembs_reference', ?)",(greference,greference_ok)) + c.execute("REPLACE INTO indexing VALUES (?, 'contig_md5', ?)",(contig_md5,contig_md5_ok)) if nonbs_index != None: c.execute("REPLACE INTO indexing VALUES (?, 'nonbs_index', ?)",(nonbs_index,nonbs_index_ok)) else: @@ -209,7 +216,17 @@ def check_mapping(self, sync = False): sdata = js.sampleData fastq_dir = config['mapping'].get('sequence_dir', '.') bam_dir = config['mapping'].get('bam_dir', '.') + cram_flag = config['mapping'].get('make_cram', None) + if cram_flag != None: + cram_flag = json.loads(str(cram_flag).lower()) + else: + cram_flag = False + if cram_flag: + mapfile_suffix = 'cram' + else: + mapfile_suffix = 'bam' + c = self.cursor() slist = {} for k, v in sdata.items(): @@ -231,7 +248,7 @@ def check_mapping(self, sync = False): for bc, fli in slist.items(): sample = sdata[fli[0]].sample_name bam = bam_dir.replace('@BARCODE', 
bc).replace('@SAMPLE', sample) - sample_bam = os.path.join(bam, "{}.bam".format(bc)) + sample_bam = os.path.join(bam, "{}.{}".format(bc, mapfile_suffix)) key_used[sample_bam] = True old = old_tab.get(sample_bam, (0,0,0,0,0)) if database._mem_db or sync: diff --git a/gemBS/parser.py b/gemBS/parser.py index 96b07c5e..87e225dc 100755 --- a/gemBS/parser.py +++ b/gemBS/parser.py @@ -133,8 +133,8 @@ def read(self, infile): state = 0 known_var = { - 'mapping': ('tmp_dir', 'threads', 'non_stranded', 'reverse_conversion', 'remove_individual_bams', 'underconversion_sequence', 'overconversion_sequence', 'bam_dir', 'sequence_dir', 'benchmark_mode'), - 'index': ('index', 'index_dir', 'reference', 'extra_references', 'reference_basename', 'nonbs_index', 'contig_sizes', 'threads', 'dbsnp_files', 'dbsnp_index', 'sampling_rate'), + 'mapping': ('tmp_dir', 'threads', 'non_stranded', 'reverse_conversion', 'remove_individual_bams', 'underconversion_sequence', 'overconversion_sequence', 'bam_dir', 'sequence_dir', 'benchmark_mode', 'make_cram'), + 'index': ('index', 'index_dir', 'reference', 'extra_references', 'reference_basename', 'nonbs_index', 'contig_sizes', 'threads', 'dbsnp_files', 'dbsnp_index', 'sampling_rate', 'populate_cache'), 'calling': ('bcf_dir', 'mapq_threshold', 'qual_threshold', 'left_trim', 'right_trim', 'threads', 'jobs', 'species', 'keep_duplicates', 'keep_improper_pairs', 'remove_individual_bcfs', 'haploid', 'reference_bias', 'conversion', 'contig_list', 'contig_pool_limit', 'benchmark_mode'), 'extract': ('extract_dir', 'jobs', 'allow_het', 'phred_threshold', 'min_inform', 'strand_specific', 'min_bc', 'make_cpg', 'make_non_cpg', 'make_bedmethyl', diff --git a/gemBS/production.py b/gemBS/production.py index 5386d030..a88cbcd1 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -141,6 +141,7 @@ def register(self, parser): ## required parameters parser.add_argument('-t', '--threads', dest="threads", help='Number of threads. 
By default GEM indexer will use the maximum available on the system.',default=None) parser.add_argument('-s', '--sampling-rate', dest="sampling_rate", help='Text sampling rate. Increasing will decrease index size at the expense of slower mapping performance.',default=None) + parser.add_argument('-p', '--populate-cache', dest="populate_cache", help='Populate reference cache if required (for CRAM).',action="store_true",required=False,default=None) parser.add_argument('-d', '--list-dbSNP-files',dest="list_dbSNP_files",nargs="+",metavar="FILES", help="List of dbSNP files (can be compressed) to create an index to later use it at the bscall step. The bed files should have the name of the SNP in column 4.",default=[]) @@ -157,39 +158,48 @@ def run(self, args): fasta_input, fasta_input_ok = db_data['reference'] extra_fasta_files = jsonData.check(section='index',key='extra_references',arg=None,list_type=True,default=[]) + populate_cache = jsonData.check(section='index',key='populate_cache',arg=args.populate_cache, boolean=True) index_name, index_ok = db_data['index'] nonbs_index_name, nonbs_index_ok = db_data.get('nonbs_index',(None, 0)) csizes, csizes_ok = db_data['contig_sizes'] greference, greference_ok = db_data['gembs_reference'] + contig_md5, contig_md5_ok = db_data['contig_md5'] + + # We trigger a regeneration of the contig_md5 file if we want to check/populate the cache + if populate_cache: + contig_md5_ok = False dbsnp_index, dbsnp_ok = db_data.get('dbsnp_idx',(None, 0)) self.threads = jsonData.check(section='index',key='threads',arg=args.threads) args.sampling_rate = jsonData.check(section='index',key='sampling_rate',arg=args.sampling_rate) args.list_dbSNP_files = jsonData.check(section='index',key='dbsnp_files',arg=args.list_dbSNP_files,list_type=True,default=[]) if not fasta_input: raise ValueError('No input reference file specified for Index command') + if extra_fasta_files == []: + extra_fasta_files = None if greference_ok == 1: logging.warning("gemBS 
reference {} already exists, skipping creation".format(greference)) else: - ret = mk_gembs_reference(fasta_input, greference, extra_fasta_files=extra_fasta_files, threads=self.threads) + ret = mk_gembs_reference(fasta_input, greference, contig_md5, extra_fasta_files=extra_fasta_files, threads=self.threads, populate_cache=populate_cache) if ret: self.command = 'mk_gembs_reference' self.log_parameter() logging.gemBS.gt("gemBS reference done: {}".format(greference)) db.check_index() + if contig_md5 != None: + logging.gemBS.gt("Contig md5 file created: {}".format(contig_md5)) + contig_md5_ok = True if index_ok == 1: logging.warning("Bisulphite Index {} already exists, skipping indexing".format(index_name)) else: self.command = 'index' self.log_parameter() - - ret = index(fasta_input, index_name, extra_fasta_files=extra_fasta_files, threads=self.threads, sampling_rate=args.sampling_rate, tmpDir=os.path.dirname(index_name)) + ret = index(index_name, greference, threads=self.threads, sampling_rate=args.sampling_rate, tmpDir=os.path.dirname(index_name)) if os.path.exists(csizes): os.remove(csizes) csizes_ok = 0 if ret: logging.gemBS.gt("Index done: {}".format(index)) - db.check_index() if nonbs_index_name != None: if nonbs_index_ok == 1: @@ -197,11 +207,13 @@ def run(self, args): else: self.command = 'nonbs index' self.log_parameter() - - ret = index(fasta_input, nonbs_index_name, nonbs_flag=True, extra_fasta_files=extra_fasta_files, threads=self.threads, sampling_rate=args.sampling_rate, tmpDir=os.path.dirname(index_name)) + ret = index(nonbs_index_name, greference, nonbs_flag=True, threads=self.threads, sampling_rate=args.sampling_rate, tmpDir=os.path.dirname(index_name)) if ret: logging.gemBS.gt("Non-bisulfite index done: {}".format(index)) - + if not contig_md5_ok: + ret = mk_contig_md5(contig_md5, greference, populate_cache) + if ret: + logging.gemBS.gt("Contig md5 file created: {}".format(contig_md5)) if dbsnp_index != None: if args.list_dbSNP_files: if dbsnp_ok: @@ 
-347,7 +359,6 @@ def run(self, args): self.overconversion_sequence = self.jsonData.check(section='mapping',key='overconversion_sequence',arg=args.overconversion_sequence) self.input_dir = self.jsonData.check(section='mapping',key='sequence_dir',arg=None,default='.',dir_type=True) - self.db = database(self.jsonData) self.db.check_index() self.mem_db = self.db.mem_db() @@ -361,7 +372,18 @@ def run(self, args): for ix_type in ('index', 'nonbs_index'): c.execute("SELECT file, status FROM indexing WHERE type = '{}'".format(ix_type)) self.index_status[ix_type] = c.fetchone() - + for fname, ftype, status in c.execute("SELECT * FROM indexing"): + if ftype == 'contig_md5': + if status != 1: + raise CommandException("contig md5 file {} not found. Run 'gemBS index' or correct configuration file and rerun".format(fname)) + else: + self.contig_md5 = fname; + elif ftype == 'gembs_reference': + if status != 1: + raise CommandException("gemBS reference {} not found. Run 'gemBS index' or correct configuration file and rerun".format(fname)) + else: + self.fasta_reference = fname + #Check Temp Directory if self.tmp_dir and not os.path.isdir(self.tmp_dir): raise CommandException("Temporary directory %s does not exists or is not a directory." %(self.tmp_dir)) @@ -589,7 +611,7 @@ def do_mapping(self, fli): outfile=outfile,paired=self.paired,tmpDir=tmp, map_threads=self.map_threads,sort_threads=self.sort_threads,sort_memory=self.sort_memory, under_conversion=self.underconversion_sequence,over_conversion=self.overconversion_sequence, - benchmark_mode=self.benchmark_mode) + benchmark_mode=self.benchmark_mode, contig_md5=self.contig_md5, greference=self.fasta_reference) if ret: logging.gemBS.gt("Bisulfite Mapping done. 
Output File: %s" %(ret)) @@ -651,7 +673,8 @@ def do_merge(self, sample, inputs, fname): desc = "merge {}".format(smp) self.json_commands[desc] = task else: - ret = merging(inputs = inputs, sample = sample, threads = self.merge_threads, outname = outfile, benchmark_mode=self.benchmark_mode) + ret = merging(inputs = inputs, sample = sample, threads = self.merge_threads, outname = outfile, + benchmark_mode=self.benchmark_mode, greference=self.fasta_reference) if ret: logging.gemBS.gt("Merging process done for {}. Output files generated: {}".format(sample, ','.join(ret))) @@ -787,6 +810,13 @@ def run(self, args): self.db.copy_to_mem() c = self.db.cursor() + for fname, ftype, status in c.execute("SELECT * FROM indexing"): + if ftype == 'gembs_reference': + if status != 1: + raise CommandException("gemBS reference {} not found. Run 'gemBS index' or correct configuration file and rerun".format(fname)) + else: + self.fasta_reference = fname + if args.sample: ret = c.execute("SELECT * from mapping WHERE sample = ?", (args.sample,)) else: diff --git a/gemBS/version.py b/gemBS/version.py index 48b6e071..9caaa075 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" __VERSION_MINOR = "4" -__VERSION_SUBMINOR = "1" +__VERSION_SUBMINOR = "2" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) diff --git a/setup.py b/setup.py index b971dace..ca3de3cd 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ def _install_bundle(install_dir, inst): os.mkdir(gemBSbin_dir) # copy tools/bin - bins = ['gemBS_cat', 'readNameClean'] + bins = ['gemBS_cat', 'readNameClean', 'md5_fasta'] if not (inst.minimal or inst.no_kent): bins.extend(['wigToBigWig', 'bedToBigBed']) for file in bins: From 67b315be7c338a7384422e200a8848553eb700cc Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sat, 14 Dec 2019 15:07:36 +0100 Subject: [PATCH 17/61] Remove bogus file creation during indexing --- gemBS/__init__.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index c197bffa..07a20e26 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -561,7 +561,7 @@ def mk_gembs_reference(input_name, greference, contig_md5, extra_fasta_files=Non os.remove(md5_contig) raise ValueError("Error while making gemBS reference") - process = run_tools([[executables['samtools'],'faidx',greference]], name='samtools faidx', output = 'greference.fai') + process = run_tools([[executables['samtools'],'faidx',greference]], name='samtools faidx', output = None) if process.wait() != 0: for f in [greference + '.fai', greference + '.gzi']: if os.path.exists(f): From 939a2464c87825f49e5d59708a32b4d8de333cd1 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sat, 14 Dec 2019 15:17:21 +0100 Subject: [PATCH 18/61] fix linking of bs_call --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index 67305772..d22626b6 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 673057723473a3e453f1a2630c08d029a47694d1 +Subproject commit d22626b649c7c3327e2e6b64e8208daeac614a11 From a40fc3b2b6138d1b16d2f6b577a486d16696b660 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sun, 15 Dec 2019 08:03:24 +0100 Subject: [PATCH 19/61] Remove GOF filter. Add autodetection of bs_call output format --- README.md | 24 +++++------------------- gemBS/bsCallReports.py | 20 ++------------------ gemBS/bsCallSphinxReports.py | 8 +------- gemBS/bsCallStats.py | 24 +++++++++--------------- gemBS/version.py | 2 +- setup.py | 8 ++------ tools/bs_call | 2 +- 7 files changed, 21 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 9ee42ca4..471573dc 100644 --- a/README.md +++ b/README.md @@ -32,14 +32,14 @@ or check the installation of several packages. 
a) gcc with development libraries b) python3, pip3, matplotlib, multiprocess - c) zlib, gsl, libncurses + c) zlib, lzma, openssl, libcurl, libncurses, wget, pigz If you are working on a clean (fairly recent) Ubuntu installation, you can install everything required with the followiwg commands: sudo apt-get update sudo apt-get install -y python3 build-essential git python3-pip wget pigz - sudo apt-get install -y zlib1g-dev libbz2-dev gsl-bin libgsl0-dev + sudo apt-get install -y zlib1g-dev libbz2-dev sudo apt-get install -y libncurses5-dev liblzma-dev libssl-dev libcurl4-openssl-dev pip3 install matplotlib multiprocess @@ -58,22 +58,6 @@ To install to the user's home directory: ``python3 setup.py install --user`` -Note that gemBS requires that GSL (GNU scientific library) is -installed prior to the installation of gemBS. If GSL has been -installed on your system to the standard system location then the -above procedure should work without modification. If, however, GSL -has been installed to a non-standard location then then --gsl-path -option to install can be used. Fo example, if the installation prefix -for GSL is /opt/local (so the libraries are in /opt/local/lib and the -include directory gsl is in /opt/local/include) then the following -install command should be used: - - ``python3 setup.py install --gsl-path=/opt/local`` - -or - - ``python3 setup.py install --gsl-path=/opt/local --user`` - ----------------------- Check your installation ----------------------- @@ -81,7 +65,6 @@ Check your installation For checking your installation follow this [worked example](http://statgen.cnag.cat/gemBS/UserGuide/_build/html/example.html). - ------------- Documentation ------------- @@ -92,6 +75,9 @@ Documentation can be found at ---------- Changelog: ---------- + 3.4.3 Remove calculation of the goodness of filter (GOF) as this is expensive, non-standard and unreliable. Removing this + removes the dependency on GSL. 
+ 3.4.3 Add autodetection of output format to bs_call (unless explicitly specified on the command line) 3.4.2 Add CRAM support (via make_cram option in configuration file) 3.4.1 Add benchmark-mode that does not write date or program version numbers into SAM/BAM or VCF/BCF files Switch to samtools, bcftools and htslib v1.10 diff --git a/gemBS/bsCallReports.py b/gemBS/bsCallReports.py index 04c4c9cb..927b9ab2 100644 --- a/gemBS/bsCallReports.py +++ b/gemBS/bsCallReports.py @@ -335,7 +335,7 @@ def configureStats(self,stats_vector=None): stats_vector - Vector of BsCallStats to build HtmlBsGenotypeCalls [totalStats,vcfFilterStats,variantCoverage,dbSnpCoverage,qualityVariant, - [fsVariant,qdVariant,qdNonVariant,rmsmqVariant,rmsmqNonVariant,gofVariant,gofNonVariant], + [fsVariant,qdVariant,qdNonVariant,rmsmqVariant,rmsmqNonVariant], mutations] """ self.total_stats = stats_vector[0] @@ -349,14 +349,7 @@ def configureStats(self,stats_vector=None): self.qdNonVariant = stats_vector[5][2] self.rmsmqVariant = stats_vector[5][3] self.rmsmqNonVariant = stats_vector[5][4] - self.gofVariant = stats_vector[5][5] - self.gofNonVariant = stats_vector[5][6] - #Set GoF Axis X Equal for both - number_locations_gof = self.gofVariant.selectTotalNumberLocations(self.gofNonVariant) - self.gofVariant.setNumberOfLocations(locationsToRecover=number_locations_gof) - self.gofNonVariant.setNumberOfLocations(locationsToRecover=number_locations_gof) - self.mutations = stats_vector[6] def createPage(self): @@ -388,11 +381,6 @@ def createPage(self): self.createSetPlots(stats_objects=[self.rmsmqVariant,self.rmsmqNonVariant],color="green", titles=["Root Mean Square of the mapping quality of reads. Variants.", "Root Mean Square of the mapping quality of reads. Non-Variants."]) - #5.4 GoodnessOfFit Variant NonVariant - self.createSetPlots(stats_objects=[self.gofVariant,self.gofNonVariant],color="blue", - titles=["Phred scaled goodness of fit to the diploid model. 
Variants.", - "Phred scaled goodness of fit to the diploid model. Non-Variants."]) - #6. Mutation All self.space() self.createTable(color="green",values=self.mutations.getTableMutationProfile(),tableTitle="Mutations") @@ -526,8 +514,6 @@ def buildSampleBscallReport(self,sample, lock = None): qdNonVariant = QCDistribution(concept="QualityByDepthNonVariant",typeDistribution="QualityByDepth",typeBaseLocation="NonVariant",pngFile=os.path.join(self.output_dir,'IMG',"{}_qd_nonvariant.png".format(sample))) rmsmqVariant = QCDistribution(concept="RMSMappingQualityVariant",typeDistribution="RMSMappingQuality",typeBaseLocation="Variant",pngFile=os.path.join(self.output_dir,'IMG',"{}_rmsmq_variant.png".format(sample))) rmsmqNonVariant = QCDistribution(concept="RMSMappingQualityNonVariant",typeDistribution="RMSMappingQuality",typeBaseLocation="NonVariant",pngFile=os.path.join(self.output_dir,'IMG',"{}_rmsmq_nonvariant.png".format(sample))) - gofVariant = QCDistribution(concept="GoodnessOfFitVariant",typeDistribution="GoodnessOfFit",typeBaseLocation="Variant",pngFile=os.path.join(self.output_dir,'IMG',"{}_gof_variant.png".format(sample))) - gofNonVariant = QCDistribution(concept="GoodnessOfFitNonVariant",typeDistribution="GoodnessOfFit",typeBaseLocation="NonVariant",pngFile=os.path.join(self.output_dir,'IMG',"{}_gof_nonvariant.png".format(sample))) #VCF Filter Stats vcfFilterStats = VCFFilterStats() #Mutations @@ -574,8 +560,6 @@ def buildSampleBscallReport(self,sample, lock = None): qdNonVariant.add(data["totalStats"]["QCDistributions"]["QualityByDepth"]) rmsmqVariant.add(data["totalStats"]["QCDistributions"]["RMSMappingQuality"]) rmsmqNonVariant.add(data["totalStats"]["QCDistributions"]["RMSMappingQuality"]) - gofVariant.add(data["totalStats"]["QCDistributions"]["GoodnessOfFit"]) - gofNonVariant.add(data["totalStats"]["QCDistributions"]["GoodnessOfFit"]) #VCF Filter Stats vcfFilterStats.add(data["totalStats"]["VCFFilterStats"]) #Mutations @@ -614,7 +598,7 @@ def 
buildSampleBscallReport(self,sample, lock = None): #DataSet Per Samples sample_stats = {"mappingCoverage": [readLevelStats,baseLevelStats,allCoverage,gcCoverage,qualityAll,nonCpGReadProfile], "calls": [totalStats,vcfFilterStats,variantCoverage,dbSnpCoverage,qualityVariant, - [fsVariant,qdVariant,qdNonVariant,rmsmqVariant,rmsmqNonVariant,gofVariant,gofNonVariant], + [fsVariant,qdVariant,qdNonVariant,rmsmqVariant,rmsmqNonVariant], mutationsStats], "methylation": [totalStats,refCpGcoverage,refCpGInfCoverage,nonRefCpGcoverage,nonRefCpGinfCoverage, qualityRefCpG,qualityNonRefCpG,plotMethylation,summaryMethylation] diff --git a/gemBS/bsCallSphinxReports.py b/gemBS/bsCallSphinxReports.py index e8cc6f11..f4c95ac3 100644 --- a/gemBS/bsCallSphinxReports.py +++ b/gemBS/bsCallSphinxReports.py @@ -375,7 +375,7 @@ def configureStats(self,stats_vector=None): stats_vector - Vector of BsCallStats to build HtmlBsGenotypeCalls [totalStats,vcfFilterStats,variantCoverage,dbSnpCoverage,qualityVariant, - [fsVariant,qdVariant,qdNonVariant,rmsmqVariant,rmsmqNonVariant,gofVariant,gofNonVariant], + [fsVariant,qdVariant,qdNonVariant,rmsmqVariant,rmsmqNonVariant], mutations] """ self.total_stats = stats_vector[0] @@ -389,8 +389,6 @@ def configureStats(self,stats_vector=None): self.qdNonVariant = stats_vector[5][2] self.rmsmqVariant = stats_vector[5][3] self.rmsmqNonVariant = stats_vector[5][4] - self.gofVariant = stats_vector[5][5] - self.gofNonVariant = stats_vector[5][6] self.mutations = stats_vector[6] @@ -455,12 +453,8 @@ def createPage(self): #5.4 GoodnessOfFit Variant NonVariant self.addSubSubSection(ident=ident,title="Phred scaled goodness of fit. Variants.") self.contents.append("\n") - self.buildImage(pathImage=self.gofVariant.relativeSphinxPathImage) - self.contents.append("\n") self.addSubSubSection(ident=ident,title="Phred scaled goodness of fit. 
Non-Variants.") self.contents.append("\n") - self.buildImage(pathImage=self.gofNonVariant.relativeSphinxPathImage) - self.contents.append("\n") #6. Mutation All self.addSubSection(ident=ident,title="Mutation All") self.contents.append("\n") diff --git a/gemBS/bsCallStats.py b/gemBS/bsCallStats.py index bf3648a0..1cf71021 100644 --- a/gemBS/bsCallStats.py +++ b/gemBS/bsCallStats.py @@ -1148,7 +1148,7 @@ def __init__(self,concept="",typeDistribution="FisherStrand",typeBaseLocation="V pngFile -- png plot file concept -- Plot Concept - typeDistribution -- Can be: "FisherStrand" | "QualityByDepth" | "RMSMappingQuality" | "GoodnessOfFit" + typeDistribution -- Can be: "FisherStrand" | "QualityByDepth" | "RMSMappingQuality" typeBaseLocation -- Can be: "" | "Variant" | "NonVariant" """ DistributionPlot.__init__(self,pngFile=pngFile,concept=concept) @@ -1162,9 +1162,6 @@ def __init__(self,concept="",typeDistribution="FisherStrand",typeBaseLocation="V else: self.y_legend += "%s Sites" %(typeBaseLocation) - #Number of Locations to be recovered in order to have the same axis for variants and non variants - self.gof_locations = 0 - def setAxisXLabel(self,newLabel=""): """ Updates X Label @@ -1222,13 +1219,13 @@ def selectTotalNumberLocations(self,other): else: return other_locations - def setNumberOfLocations(self,locationsToRecover): - """ - Updates the number of locations to be recovered - - locationsToRecover - New Locations Value - """ - self.gof_locations = locationsToRecover +# def setNumberOfLocations(self,locationsToRecover): +# """ +# Updates the number of locations to be recovered +# +# locationsToRecover - New Locations Value +# """ +# self.gof_locations = locationsToRecover def getUnifiedVectorToPlot(self,locationsToRecover): """From Dictionary Get Vector of Y values to be bar plotted @@ -1254,8 +1251,6 @@ def plot(self): if self.type_distribution == "FisherStrand" or self.type_distribution == "QualityByDepth": yValues = self.getVectorToPlot(cleanTail=True) - elif 
self.type_distribution == "GoodnessOfFit": - yValues = self.getUnifiedVectorToPlot(locationsToRecover=self.gof_locations) else: yValues = self.getVectorToPlot(cleanTail=False) @@ -1335,8 +1330,7 @@ def getTable(self): vector_table.append(["","","","","","",""]) #6.ROW FILTERED MOTIVES filtered_citerias = ["q20","qd2","q20,qd2", "fs60","q20,fs60","qd2,fs60","q20,qd2,fs60","mq40","q20,mq40","qd2,mq40","q20,qd2,mq40","fs60,mq40","q20,fs60,mq40", - "qd2,fs60,mq40","q20,qd2,fs60,mq40","gof20","q20,gof20","qd2,gof20","q20,qd2,gof20","fs60,gof20","q20,fs60,gof20","qd2,fs60,gof20","q20,qd2,fs60,gof20", - "mq40,gof20","q20,mq40,gof20","qd2,mq40,gof20","q20,qd2,mq40,gof20","fs60,mq40,gof20","q20,fs60,mq40,gof20","qd2,fs60,mq40,gof20","q20,qd2,fs60,mq40,gof20"] + "qd2,fs60,mq40","q20,qd2,fs60,mq40"] tuples_vector = [] for filtered_criteria in filtered_citerias: diff --git a/gemBS/version.py b/gemBS/version.py index 9caaa075..79279ae0 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" __VERSION_MINOR = "4" -__VERSION_SUBMINOR = "2" +__VERSION_SUBMINOR = "3" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) diff --git a/setup.py b/setup.py index ca3de3cd..d48e0c4b 100644 --- a/setup.py +++ b/setup.py @@ -20,10 +20,8 @@ pwd = os.path.abspath(os.path.dirname(__file__)) exec(open(os.path.join(pwd,'gemBS','version.py')).read()) -def compile_gemBS_tools(options, gsl_path, enable_cuda, disable_cuda): +def compile_gemBS_tools(options, enable_cuda, disable_cuda): make_com = 'make ' + ' '.join(options) - if gsl_path: - make_com = "BS_CALL_CONFIG=\'--with-gsl={}\' ".format(gsl_path) + make_com if disable_cuda: make_com = "GEM3_CONFIG=\'--disable-cuda\' " + make_com elif enable_cuda: @@ -163,7 +161,6 @@ class install(_install): "Perform minimal install (equivalent to --no-samtools --no-kent --no-gem3 --no-bscall)"), ('disable-cuda', None, "Do not build GPU support for GEM3 (default)"), ('enable-cuda', None, "Try to 
build GPU support for GEM3"), - ('gsl-path=', None, "Installation path of GSL library") ]) _install.boolean_options.extend(['no-samtools','no-kent','no-gem3','no-bscall','minimal']) @@ -175,7 +172,6 @@ def initialize_options(self): self.no_bscall = False self.disable_cuda = False self.enable_cuda = False - self.gsl_path = None _install.initialize_options(self) def run(self): @@ -193,7 +189,7 @@ def run(self): if not self.enable_cuda: self.diable_cuda = True - compile_gemBS_tools(options, self.gsl_path, self.enable_cuda, self.disable_cuda) + compile_gemBS_tools(options, self.enable_cuda, self.disable_cuda) _install.run(self) # find target folder diff --git a/tools/bs_call b/tools/bs_call index d22626b6..64aa13e6 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit d22626b649c7c3327e2e6b64e8208daeac614a11 +Subproject commit 64aa13e6d93bf1c340236a8859495c0f69fc7c90 From 72323db1ba69b4ac12a45560b32729bd5460dbb9 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sun, 15 Dec 2019 10:38:59 +0100 Subject: [PATCH 20/61] Correct warnings and remove typo --- gemBS/__init__.py | 6 +++--- tools/bs_call | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 07a20e26..620b746b 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -550,15 +550,15 @@ def mk_gembs_reference(input_name, greference, contig_md5, extra_fasta_files=Non bgzip_command.extend(['-@', str(threads)]); process = run_tools([md5_fasta,bgzip_command], name='md5_fasta', output = greference) if process.wait() != 0: - for f in [greference, md5_contig]: + for f in [greference, contig_md5]: if os.path.exists(f): os.remove(f) raise ValueError("Error while making gemBS reference") else: process = run_tools([md5_fasta], name='md5_fasta', output = None) if process.wait() != 0: - if os.path.exists(md5_contig): - os.remove(md5_contig) + if os.path.exists(contig_md5): + os.remove(contig_md5) raise ValueError("Error while making gemBS 
reference") process = run_tools([[executables['samtools'],'faidx',greference]], name='samtools faidx', output = None) diff --git a/tools/bs_call b/tools/bs_call index 64aa13e6..c6fd9d3d 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 64aa13e6d93bf1c340236a8859495c0f69fc7c90 +Subproject commit c6fd9d3dd75cff4def78a88db96c3e66e0b94934 From e882e8dbff289ebd9e34fbadce20c9cef35d554f Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sun, 15 Dec 2019 11:31:16 +0100 Subject: [PATCH 21/61] Add check for header in fast files --- tools/utils/md5_fasta.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/utils/md5_fasta.c b/tools/utils/md5_fasta.c index 367a7e1e..d87e2a76 100644 --- a/tools/utils/md5_fasta.c +++ b/tools/utils/md5_fasta.c @@ -190,6 +190,10 @@ static void process_file(FILE * const fp, FILE * const fout, const bool stream, MD5_Init(&ctx); tlen = 0; } else { + if(!ctg) { + fprintf(stderr,"md5_fasta:no header found"); + exit(-1); + } // First, strip characters not between 33 and 126, and convert to upper case. 
char *p = buf, *p1 = buf; while(*p) { From f99f65fdb9e571f47cb0587ee03c4739b03b01ec Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sun, 15 Dec 2019 20:22:54 +0100 Subject: [PATCH 22/61] Add missing keywords to parser --- gemBS/parser.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/gemBS/parser.py b/gemBS/parser.py index 87e225dc..c9399dfa 100755 --- a/gemBS/parser.py +++ b/gemBS/parser.py @@ -133,12 +133,16 @@ def read(self, infile): state = 0 known_var = { - 'mapping': ('tmp_dir', 'threads', 'non_stranded', 'reverse_conversion', 'remove_individual_bams', 'underconversion_sequence', 'overconversion_sequence', 'bam_dir', 'sequence_dir', 'benchmark_mode', 'make_cram'), - 'index': ('index', 'index_dir', 'reference', 'extra_references', 'reference_basename', 'nonbs_index', 'contig_sizes', 'threads', 'dbsnp_files', 'dbsnp_index', 'sampling_rate', 'populate_cache'), - 'calling': ('bcf_dir', 'mapq_threshold', 'qual_threshold', 'left_trim', 'right_trim', 'threads', 'jobs', 'species', 'keep_duplicates', 'keep_improper_pairs', + 'mapping': ('tmp_dir', 'threads', 'non_stranded', 'reverse_conversion', 'remove_individual_bams', + 'underconversion_sequence', 'overconversion_sequence', 'bam_dir', 'sequence_dir', 'benchmark_mode', + 'make_cram', 'map_threads', 'sort_threads', 'merge_threads', 'sort_memory'), + 'index': ('index', 'index_dir', 'reference', 'extra_references', 'reference_basename', 'nonbs_index', 'contig_sizes', + 'threads', 'dbsnp_files', 'dbsnp_index', 'sampling_rate', 'populate_cache'), + 'calling': ('bcf_dir', 'mapq_threshold', 'qual_threshold', 'left_trim', 'right_trim', 'threads', 'jobs', 'species', + 'keep_duplicates', 'keep_improper_pairs', 'call_threads', 'merge_threads', 'remove_individual_bcfs', 'haploid', 'reference_bias', 'conversion', 'contig_list', 'contig_pool_limit', 'benchmark_mode'), - 'extract': ('extract_dir', 'jobs', 'allow_het', 'phred_threshold', 'min_inform', 'strand_specific', 'min_bc', 'make_cpg', 
'make_non_cpg', 'make_bedmethyl', - 'make_bigwig', 'make_snps', 'snp_list', 'snp_db', 'reference_bias'), + 'extract': ('extract_dir', 'jobs', 'allow_het', 'phred_threshold', 'min_inform', 'strand_specific', 'min_bc', 'make_cpg', 'make_non_cpg', + 'make_bedmethyl', 'make_bigwig', 'make_snps', 'snp_list', 'snp_db', 'reference_bias'), 'report': ('project', 'report_dir', 'threads') } # Check if variables are used From 8ab2618b7056bc35c317995f3956ae3141641e49 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Mon, 16 Dec 2019 14:23:51 +0100 Subject: [PATCH 23/61] Fix exception with invalid input in configuration file and sort inputs for samtools merge step --- gemBS/__init__.py | 5 ++++- gemBS/production.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 620b746b..6133937e 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -195,7 +195,10 @@ def check(self, section, key, arg=None, default=None, boolean=False, dir_type=Fa ret = default if ret != None: if boolean: - ret = json.loads(str(ret).lower()) + try: + ret = json.loads(str(ret).lower()) + except json.decoder.JSONDecodeError: + ret = False elif int_type: ret = int(ret) elif dir_type: diff --git a/gemBS/production.py b/gemBS/production.py index a88cbcd1..ba4651f3 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -140,7 +140,7 @@ class Index(BasicPipeline): def register(self, parser): ## required parameters parser.add_argument('-t', '--threads', dest="threads", help='Number of threads. By default GEM indexer will use the maximum available on the system.',default=None) - parser.add_argument('-s', '--sampling-rate', dest="sampling_rate", help='Text sampling rate. Increasing will decrease index size at the expense of slower mapping performance.',default=None) + parser.add_argument('-s', '--sampling-rate', dest="sampling_rate", help='Text sampling rate. 
Increasing will decrease index size at the expense of slower performance.',default=None) parser.add_argument('-p', '--populate-cache', dest="populate_cache", help='Populate reference cache if required (for CRAM).',action="store_true",required=False,default=None) parser.add_argument('-d', '--list-dbSNP-files',dest="list_dbSNP_files",nargs="+",metavar="FILES", help="List of dbSNP files (can be compressed) to create an index to later use it at the bscall step. The bed files should have the name of the SNP in column 4.",default=[]) @@ -628,6 +628,7 @@ def do_mapping(self, fli): def do_merge(self, sample, inputs, fname): if inputs: + inputs.sort() self.db.isolation_level = None c = self.db.cursor() try_get_exclusive(c) @@ -645,7 +646,7 @@ def do_merge(self, sample, inputs, fname): c.execute("COMMIT") # Register output files and db cleanup in case of failure odir = os.path.dirname(outfile) - ixfile = os.path.join(odir, smp + '.bai') + ixfile = os.path.join(odir, smp + '.csi') md5file = outfile + '.md5' database.reg_db_com(outfile, "UPDATE mapping SET status = 0 WHERE filepath = '{}'".format(outfile), [outfile, ixfile, md5file]) if self.dry_run or self.dry_run_json: From d19623ad98dada2a7829e8f32c8b83c014f87d69 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Thu, 19 Dec 2019 06:54:37 +0100 Subject: [PATCH 24/61] Sort list of bcf files before running bcftools concat --- gemBS/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 6133937e..1b4394bf 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -1344,6 +1344,7 @@ def bsConcat(list_bcfs=None,sample=None,threads=None,bcfSample=None,benchmark_mo concat.extend(['--threads', threads]) if benchmark_mode: concat.append('--no-version') + list_bcfs.sort() concat.extend(list_bcfs) process = run_tools([concat],name="Concatenation Calls",logfile=logfile) From 6d7a8ab25c2c44e6c6cca1485bba5b5fbaafc88f Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Thu, 19 Dec 2019 08:28:41 
+0100 Subject: [PATCH 25/61] Add / modify sort steps to ensure reproducibility for VCF/BCF generation --- README.md | 11 +++++++---- gemBS/database.py | 4 ++-- gemBS/version.py | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 471573dc..b364e6b4 100644 --- a/README.md +++ b/README.md @@ -75,17 +75,20 @@ Documentation can be found at ---------- Changelog: ---------- + 3.4.4 Sort input bcf files to bcftools concat stage to ensure reproducibility. + 3.4.4 Add extra sort keys when generating pools to ensure stability of pool membership in the event of multiple contigs + having the same size 3.4.3 Remove calculation of the goodness of filter (GOF) as this is expensive, non-standard and unreliable. Removing this - removes the dependency on GSL. - 3.4.3 Add autodetection of output format to bs_call (unless explicitly specified on the command line) + removes the dependency on GSL. + 3.4.3 Add autodetection of output format to bs_call (unless explicitly specified on the command line) 3.4.2 Add CRAM support (via make_cram option in configuration file) 3.4.1 Add benchmark-mode that does not write date or program version numbers into SAM/BAM or VCF/BCF files - Switch to samtools, bcftools and htslib v1.10 + Switch to samtools, bcftools and htslib v1.10 3.4.0 Move to new bs_call version (2.1.0) which is more efficient in memory use and can read BAMs and write BCFs natively. The new bs_call requires a faidx indexed reference, so gemBS no creates this during indexing. 
- 3.4.0 Add switches to give more control to threads and memory + 3.4.0 Add switches to give more control to threads and memory usage in mapping and calling stages 3.3.3 Remove legacy pathway for config files with no header line (fix issue 'error in gemBS index #65) 3.3.2 Fix error where header line for wig files could be omitted diff --git a/gemBS/database.py b/gemBS/database.py index 3506f2d6..140d0ed6 100644 --- a/gemBS/database.py +++ b/gemBS/database.py @@ -402,8 +402,8 @@ def check_contigs(self, sync = False): while pname(ix) in pools_used: ix += 1 pools.append([pname(ix), [], 0]) ix += 1 - for ctg in sorted(small_contigs, key = lambda x: -contig_size[x]): - pl = sorted(pools, key = lambda x: x[2])[0] + for ctg in sorted(small_contigs, key = lambda x: (-contig_size[x], ctg)): + pl = sorted(pools, key = lambda x: (x[2], x[0]))[0] sz = contig_size[ctg] pl[1].append(ctg) pl[2] = pl[2] + sz diff --git a/gemBS/version.py b/gemBS/version.py index 79279ae0..306c9ba5 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" __VERSION_MINOR = "4" -__VERSION_SUBMINOR = "3" +__VERSION_SUBMINOR = "4" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) From db12bb9494642b0c71e9249391c436bd0f416ff8 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Mon, 20 Jan 2020 06:42:10 +0100 Subject: [PATCH 26/61] Initial commit of new mextr version as stand-along parallel utility --- tools/Makefile | 54 +- tools/utils/{Makefile => Makefile.in} | 16 +- tools/utils/bbi.c | 631 ++++++ tools/utils/bbi.h | 77 + tools/utils/bbi_defs.h | 26 + tools/utils/bbi_structs.h | 142 ++ tools/utils/calc_gt_prob.c | 327 +++ tools/utils/command_line.c | 375 ++++ tools/utils/configure | 2847 +++++++++++++++++++++++++ tools/utils/configure.ac | 18 + tools/utils/files.c | 173 ++ tools/utils/init_params.c | 56 + tools/utils/mextr.c | 100 + tools/utils/mextr.h | 216 ++ tools/utils/output.c | 850 ++++++++ tools/utils/output_headers.c | 90 + 
tools/utils/output_utils.c | 51 + tools/utils/rec.c | 56 + tools/utils/stats.c | 32 + tools/utils/unpack.c | 276 +++ 20 files changed, 6368 insertions(+), 45 deletions(-) rename tools/utils/{Makefile => Makefile.in} (68%) create mode 100644 tools/utils/bbi.c create mode 100644 tools/utils/bbi.h create mode 100644 tools/utils/bbi_defs.h create mode 100644 tools/utils/bbi_structs.h create mode 100644 tools/utils/calc_gt_prob.c create mode 100644 tools/utils/command_line.c create mode 100755 tools/utils/configure create mode 100644 tools/utils/configure.ac create mode 100644 tools/utils/files.c create mode 100644 tools/utils/init_params.c create mode 100644 tools/utils/mextr.c create mode 100644 tools/utils/mextr.h create mode 100644 tools/utils/output.c create mode 100644 tools/utils/output_headers.c create mode 100644 tools/utils/output_utils.c create mode 100644 tools/utils/rec.c create mode 100644 tools/utils/stats.c create mode 100644 tools/utils/unpack.c diff --git a/tools/Makefile b/tools/Makefile index fbb26b71..1619c38f 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -12,43 +12,27 @@ ROOT_PATH=$(CURDIR) # samtools and bcftools definitions SAMTOOLS_VERSION=1.10 BCFTOOLS_VERSION=1.10 -UCSCTOOLS_VERSION=v378 SAMTOOLS_DIR=samtools BCFTOOLS_DIR=bcftools -UCSCTOOLS_DIR=userApps SAMTOOLS=$(SAMTOOLS_DIR)/samtools BCFTOOLS=$(BCFTOOLS_DIR)/bcftools SAMTOOLS_TAR=samtools-$(SAMTOOLS_VERSION).tar.bz2 BCFTOOLS_TAR=bcftools-$(BCFTOOLS_VERSION).tar.bz2 -UCSCTOOLS_TAR=userApps.$(UCSCTOOLS_VERSION).src.tgz SAMTOOLS_URL=https://github.com/samtools/samtools/releases/download/$(SAMTOOLS_VERSION)/$(SAMTOOLS_TAR) BCFTOOLS_URL=https://github.com/samtools/bcftools/releases/download/$(BCFTOOLS_VERSION)/$(BCFTOOLS_TAR) -UCSCTOOLS_URL=http://hgdownload.cse.ucsc.edu/admin/exe/$(UCSCTOOLS_TAR) MACHTYPE:=$(shell uname -m) ifneq (,$(findstring -,$(MACHTYPE))) MACHTYPE:=$(shell uname -m) endif - -UCSCTOOLS_LIB=$(UCSCTOOLS_DIR)/kent/src/lib/$(MACHTYPE)/jkweb.a 
-UCSCTOOLS_HTSLIB=$(UCSCTOOLS_DIR)/kent/src/htslib/libhts.a - -#UNAME := $(shell uname -s) -#ifeq ($(UNAME),Linux) -# ARCHIVE=http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64.v369 -#endif -#ifeq ($(UNAME),Darwin) -# ARCHIVE=http://hgdownload.cse.ucsc.edu/admin/exe/macOSX.x86_64 -#endif - FOLDER_BIN=bin -all: setup gem3 _samtools _bcftools kent _bs_call _utils +all: setup gem3 _samtools _bcftools _bs_call _utils -all_static: setup gem3-static _samtools _bcftools kent _bs_call _utils +all_static: setup gem3-static _samtools _bcftools _bs_call _utils -all_debug: setup gem3-debug _samtools _bcftools kent _bs_call _utils +all_debug: setup gem3-debug _samtools _bcftools _bs_call _utils _samtools: $(SAMTOOLS_DIR) $(SAMTOOLS_DIR)/config.mk $(MAKE) --directory=$(SAMTOOLS_DIR) all all-htslib @@ -56,14 +40,15 @@ _samtools: $(SAMTOOLS_DIR) $(SAMTOOLS_DIR)/config.mk _bcftools: $(BCFTOOLS_DIR) $(BCFTOOLS_DIR)/config.h $(MAKE) $(FOLDER_BIN)/bcftools -kent: $(FOLDER_BIN)/wigToBigWig $(FOLDER_BIN)/bedToBigBed - _bs_call: bs_call/src/Makefile.mk $(MAKE) --directory=bs_call -_utils: +_utils: utils/Makefile $(MAKE) --directory=utils +utils/Makefile: utils/Makefile.in utils/configure _samtools + cd utils; ./configure --with-htslib=../${SAMTOOLS_DIR}/htslib-${SAMTOOLS_VERSION} + setup: @mkdir -p $(FOLDER_BIN) @@ -100,41 +85,24 @@ $(BCFTOOLS_DIR): wget $(BCFTOOLS_URL) && tar -jxf $(BCFTOOLS_TAR) && rm -f $(BCFTOOLS_TAR) mv bcftools-$(BCFTOOLS_VERSION) $(BCFTOOLS_DIR) -$(UCSCTOOLS_DIR): - wget $(UCSCTOOLS_URL) && tar -zxf $(UCSCTOOLS_TAR) && rm -f $(UCSCTOOLS_TAR) - echo BINDIR=$(CURDIR)/bin > $(UCSCTOOLS_DIR)/kent/src/inc/localEnvironment.mk - -$(UCSCTOOLS_LIB): $(UCSCTOOLS_DIR) - cd $(UCSCTOOLS_DIR)/kent/src/lib && make - -$(UCSCTOOLS_HTSLIB): $(UCSCTOOLS_DIR) - cd $(UCSCTOOLS_DIR)/kent/src/htslib && make - $(BCFTOOLS_DIR)/plugins/%.c: ln -sf ../../gemBS_plugins/$(notdir $@) $(BCFTOOLS_DIR)/plugins/ ln -sf ../../gemBS_plugins/$(basename $(notdir $@)).mk $(BCFTOOLS_DIR)/plugins/ 
-$(FOLDER_BIN)/bedToBigBed: $(UCSCTOOLS_LIB) $(UCSCTOOLS_HTSLIB) - cd $(UCSCTOOLS_DIR)/kent/src/utils/bedToBigBed && make - -$(FOLDER_BIN)/wigToBigWig: $(UCSCTOOLS_LIB) $(UCSCTOOLS_HTSLIB) - cd $(UCSCTOOLS_DIR)/kent/src/utils/wigToBigWig && make - -ucsctools_clean: - if [ -d $(UCSCTOOLS_DIR) ]; then $(MAKE) --directory=$(UCSCTOOLS_DIR)/kent/src/lib clean; $(MAKE) --directory=$(UCSCTOOLS_DIR)/kent/src/htslib clean; $(MAKE) --directory=$(UCSCTOOLS_DIR)/kent/src/utils/bedToBigBed clean; $(MAKE) --directory=$(UCSCTOOLS_DIR)/kent/src/utils/wigToBigWig clean; fi - -clean: ucsctools_clean +clean: @rm -f *~ @rm -rf $(FOLDER_BIN) if [ -f $(SAMTOOLS_DIR)/Makefile ]; then $(MAKE) --directory=$(SAMTOOLS_DIR) clean; fi if [ -f $(BCFTOOLS_DIR)/Makefile ]; then $(MAKE) --directory=$(BCFTOOLS_DIR) clean; fi if [ -f "gem3-mapper/Makefile.mk" ]; then $(MAKE) --directory=gem3-mapper clean; fi if [ -f "bs_call/src/Makefile.mk" ]; then $(MAKE) --directory=bs_call clean; fi + if [ -f "utils/Makefile" ]; then $(MAKE) --directory=utils clean; fi -distclean: ucsctools_clean +distclean: @rm -f *~ @rm -rf $(FOLDER_BIN) if [ -f $(SAMTOOLS_DIR)/Makefile ]; then cd $(SAMTOOLS_DIR); $(MAKE) clean; rm -f config.h config.log config.status config.mk; fi if [ -f $(BCFTOOLS_DIR)/Makefile ]; then cd $(BCFTOOLS_DIR); $(MAKE) clean; rm -f config.h config.log config.status config.mk; fi if [ -f "gem3-mapper/Makefile.mk" ]; then $(MAKE) --directory=gem3-mapper distclean; fi if [ -f "bs_call/src/Makefile.mk" ]; then $(MAKE) --directory=bs_call distclean; fi + if [ -f "utils/Makefile" ]; then $(MAKE) --directory=utils distclean; fi diff --git a/tools/utils/Makefile b/tools/utils/Makefile.in similarity index 68% rename from tools/utils/Makefile rename to tools/utils/Makefile.in index 6698d071..349ab8a9 100644 --- a/tools/utils/Makefile +++ b/tools/utils/Makefile.in @@ -13,13 +13,18 @@ CC=gcc ROOT_PATH=.. 
-TOOLS=gemBS_cat readNameClean md5_fasta +TOOLS=gemBS_cat readNameClean md5_fasta mextr FOLDER_BIN=../bin TOOLS_BIN=$(addprefix $(FOLDER_BIN)/, $(TOOLS)) LIBS:= -lm +MEXTR_INC = @HTSINC@ +MEXTR_LIBS = @HTSLIBS@ -lz -lbz2 -lpthread $(LIBS) -all: TOOLS_FLAGS=-O3 $(GENERAL_FLAGS) $(ARCH_FLAGS) $(SUPPRESS_CHECKS) $(OPTIMIZTION_FLAGS) $(ARCH_FLAGS_OPTIMIZTION_FLAGS) +MEXTR_SRC=mextr.c calc_gt_prob.c output.c output_utils.c output_headers.c command_line.c init_params.c files.c \ + stats.c unpack.c rec.c bbi.c + +all: TOOLS_FLAGS=-O3 -g $(GENERAL_FLAGS) $(ARCH_FLAGS) $(SUPPRESS_CHECKS) $(OPTIMIZTION_FLAGS) $(ARCH_FLAGS_OPTIMIZTION_FLAGS) all: $(TOOLS_BIN) static: TOOLS_FLAGS=-O3 $(GENERAL_FLAGS) $(ARCH_FLAGS) $(SUPPRESS_CHECKS) $(OPTIMIZTION_FLAGS) $(ARCH_FLAGS_OPTIMIZTION_FLAGS) -static @@ -31,6 +36,9 @@ debug: $(TOOLS_BIN) clean: rm -f *~ *.o *.a +distclean: clean + rm -f Makefile config.status + utils.o: utils.c utils.h $(CC) $(TOOLS_FLAGS) -c utils.c @@ -42,3 +50,7 @@ $(FOLDER_BIN)/md5_fasta: md5_fasta.c utils.o $(FOLDER_BIN)/readNameClean: readNameClean.c utils.o $(CC) $(TOOLS_FLAGS) -o $@ readNameClean.c utils.o $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS) + +$(FOLDER_BIN)/mextr: $(MEXTR_SRC) mextr.h bbi.h bbi_defs.h bbi_structs.h utils.o + $(CC) $(TOOLS_FLAGS) -o $@ $(MEXTR_SRC) utils.o $(MEXTR_INC) $(MEXTR_LIBS) + diff --git a/tools/utils/bbi.c b/tools/utils/bbi.c new file mode 100644 index 00000000..c702d142 --- /dev/null +++ b/tools/utils/bbi.c @@ -0,0 +1,631 @@ +/* + * bbi.c + * + * Created on: Jan 7, 2020 + * Author: heath + * + * Routines to write bigBed and bigWig files from the bedmethyl and wig files + * generated for the standard ENCODE analysis pipeline. + * + * This is not and is not intended to be a general purpose library for writing + * bigBed and bigWig files, and is instead a specialized set of routines that take + * advantage of the peculiarities of the data (all ranges of size 1, most cytosines + * not methylated etc.) 
to increase speed and memory efficiency. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "htslib/khash_str2int.h" +#include "mextr.h" +#include "bbi.h" + +static const char *autosql_desc = + "table BisulfiteSeq\n" + "\"BED9+5 scores for bisulfite-seq data\"\n" + "\t(\n" + "\tstring\tchrom;\t\"Reference chromosome or scaffold\"\n" + "\tuint\tchromStart;\t\"Start position in chromosome\"\n" + "\tuint\tchromEnd;\t\"End position in chromosome\"\n" + "\tstring\tname;\t\"Name of item\"\n" + "\tuint\tscore;\t\"Score from 0-1000. Capped number of reads\"\n" + "\tchar[1]\tstrand;\t\"+ or - or . for unknown\"\n" + "\tuint\tthickStart;\t\"Start of where display should be thick (start codon)\"\n" + "\tuint\tthickEnd;\t\"End of where display should be thick (stop codon)\"\n" + "\tuint\treserved;\t\"Color value R,G,B\"\n" + "\tuint\treadCount;\t\"Number of reads or coverage\"\n" + "\tuint\tpercentMeth;\t\"Percentage of reads that show methylation at this position in the genome\"\n" + "\tstring\trefContext;\t\"Reference context on strand (2 bases for CpG, 3 bases for CHG, CHH)\"\n" + "\tstring\tcalledContext;\t\"Called context on strand (2 bases for CpG, 3 bases for CHG, CHH)\"\n" + "\tuint\tgenotypeQual;\t\"Phred score for genotype call\"\n" + "\t)\n"; + +void write_bbi_header(FILE * const fp, bbi_header_t * const header, bbi_global_data_t * const bdata) { + assert(fp); + fseek(fp, 0, SEEK_SET); + bbi_write(fp, header->magic); + bbi_write(fp, header->version); + bbi_write(fp, header->zoomLevels); + bbi_write(fp, header->chromosomeTreeOffset); + bbi_write(fp, header->fullDataOffset); + bbi_write(fp, header->fullIndexOffset); + bbi_write(fp, header->fieldCount); + bbi_write(fp, header->definedFieldCount); + bbi_write(fp, header->autoSqlOffset); + bbi_write(fp, header->totalSummaryOffset); + bbi_write(fp, header->uncompressBufSize); + bbi_write(fp, header->extensionOffset); + // zoomHeaders + uint32_t res = 0; + for(int i = 0; i < 
ZOOM_LEVELS; i++) { + bbi_write(fp, bdata->zoom_scales[i]); + bbi_write(fp, res); + bbi_write(fp, bdata->zoom_data_offset[i]); + bbi_write(fp, bdata->zoom_index_offset[i]); + } + // Add autoSql string + if(header->autoSqlOffset) { + fseek(fp, header->autoSqlOffset, SEEK_SET); + fwrite(autosql_desc, 1, 1 + strlen(autosql_desc), fp); + } + // And total summary + bbi_write(fp, bdata->total_bases); + bbi_write(fp, bdata->min_x); + bbi_write(fp, bdata->max_x); + bbi_write(fp, bdata->sum_x); + bbi_write(fp, bdata->sum_xsq); + // And extended header + fseek(fp, header->extensionOffset, SEEK_SET); + uint16_t ext_size = 64; + bbi_write(fp, ext_size); + for(int i = 0; i < 62; i++) fputc(0, fp); +} + +static int ctg_id_lookup(const tree_t * tree, const int i, const int level) { + const int j = ((const int * const)tree->start[level])[i]; + return level > 0 ? ctg_id_lookup(tree, j, level - 1) : j; +} + +void write_contig_tree_level(FILE *fp, const contig_tree_t *ctree, const int level) { + uint64_t pos = ftell(fp); + const tree_t * tree = &ctree->tree; + char * const keybuf = malloc(ctree->key_len); + const int n_nodes = tree->width[level]; // Number of nodes at this level + const uint64_t item_size = ctree->key_len + 8; // item size is the same for leaf and non-leaf nodes + const int * const start_ix = tree->start[level]; + if(level > 0) { + // Non-leaf nodes + const int n_nodes1 = tree->width[level - 1]; // Number of nodes at next (lower) level + uint64_t off = pos + 4 * n_nodes + item_size * n_nodes1; // Offset of first node at next (lower) level + const int * const start_ix1 = tree->start[level - 1]; + uint16_t zero = 0; + for(int i = 0; i < n_nodes; i++) { + uint16_t n_items = start_ix[i+1] - start_ix[i]; + bbi_write(fp, zero); + bbi_write(fp, n_items); + for(int j = start_ix[i]; j < start_ix[i + 1]; j++) { + // Look up chromosome name corresponding to item position + uint32_t id = ctg_id_lookup(tree, j, level - 1); + strncpy(keybuf, ctree->names[(int)id], ctree->key_len); 
+ fwrite(keybuf, 1, ctree->key_len, fp); + bbi_write(fp, off); + off += 4 + item_size * (start_ix1[j + 1] - start_ix1[j]); + } + } + } else { + // Leaf nodes + for(int i = 0; i < n_nodes; i++) { + uint8_t isLeaf = 1; + uint8_t res = 0; + uint16_t n_items = start_ix[i + 1] - start_ix[i]; + bbi_write(fp, isLeaf); + bbi_write(fp, res); + bbi_write(fp, n_items); + for(int j = start_ix[i]; j < start_ix[i + 1]; j++) { + strncpy(keybuf, ctree->names[j], ctree->key_len); + uint32_t id = j; + uint32_t csize = ctree->len[id]; + fwrite(keybuf, 1, ctree->key_len, fp); + bbi_write(fp, id); + bbi_write(fp, csize); + } + } + } + free(keybuf); +} + +void write_contig_tree(FILE * const fp, bbi_header_t * const header, const contig_tree_t * const ctree) { + assert(fp); + fseek(fp, header->chromosomeTreeOffset, SEEK_SET); + // Write chromosome B+ tree header + uint32_t magic = 0x78CA8C91; + uint32_t valSize = 8; + uint64_t reserved = 0; + bbi_write(fp, magic); + bbi_write(fp, ctree->tree.block_size); + bbi_write(fp, ctree->key_len); + bbi_write(fp, valSize); + bbi_write(fp, ctree->tree.n_items); + bbi_write(fp, reserved); + // Write contig tree levels + for(int i = ctree->tree.depth - 1; i >= 0; i--) write_contig_tree_level(fp, ctree, i); +} + +void write_r_tree_level(FILE *fp, const r_tree_t *rtree, const int level) { + uint64_t pos = ftell(fp); + const tree_t * tree = &rtree->tree; + const int n_nodes = tree->width[level]; // Number of nodes at this level + const uint64_t leaf_item_size = 32; + const uint64_t non_leaf_item_size = 24; + const r_node_t * const rn = tree->start[level]; + if(level > 0) { + // Non-leaf nodes + const int n_nodes1 = tree->width[level - 1]; // Number of nodes at next (lower) level + uint64_t off = pos + 4 * n_nodes + non_leaf_item_size * n_nodes1; // Offset of first node at next (lower) level + const r_node_t * const rn1 = tree->start[level - 1]; + uint16_t zero = 0; + int item_size1 = level > 1 ? 
non_leaf_item_size : leaf_item_size; + for(int i = 0; i < n_nodes; i++) { + uint16_t n_items = rn[i + 1].start_idx - rn[i].start_idx; + bbi_write(fp, zero); + bbi_write(fp, n_items); + for(int j = rn[i].start_idx; j < rn[i + 1].start_idx; j++) { + bbi_write(fp, rn1[j].start_ctg); + bbi_write(fp, rn1[j].start_base); + bbi_write(fp, rn1[j].end_ctg); + bbi_write(fp, rn1[j].end_base); + bbi_write(fp, off); + off += 4 + item_size1 * (rn1[j + 1].start_idx - rn1[j].start_idx); + } + } + } else { + // Leaf nodes + r_tree_block_t * const rbp = rtree->blocks; + for(int i = 0; i < n_nodes; i++) { + uint8_t isLeaf = 1; + uint8_t res = 0; + uint16_t n_items = rn[i + 1].start_idx - rn[i].start_idx; + bbi_write(fp, isLeaf); + bbi_write(fp, res); + bbi_write(fp, n_items); + for(int j = rn[i].start_idx; j < rn[i + 1].start_idx; j++) { + bbi_write(fp, rbp[j].ctg); + bbi_write(fp, rbp[j].block->start); + bbi_write(fp, rbp[j].ctg); + bbi_write(fp, rbp[j].block->end); + bbi_write(fp, rbp[j].block->offset); + uint64_t size = (j + 1 < tree->n ? rbp[j + 1].block->offset : rtree->end_offset) - rbp[j].block->offset; + bbi_write(fp, size); + } + } + } +} + +void write_r_tree(args_t const * args, const int ix, r_tree_t * const rtree) { + FILE * const fp = ix < 3 ? args->bigbedfiles[ix] : args -> bigwigfiles[ix - 3]; + const uint32_t magic = 0x2468ACE0; + bbi_write(fp, magic); + bbi_write(fp, rtree->tree.block_size); + bbi_write(fp, rtree->tree.n_items); + r_node_t * const root_node = rtree->tree.start[rtree->tree.depth - 1]; + bbi_write(fp, root_node->start_ctg); + bbi_write(fp, root_node->start_base); + bbi_write(fp, root_node->end_ctg); + bbi_write(fp, root_node->end_base); + bbi_write(fp, args->bb_global[ix].index_offset); + const uint32_t items_per_slot = ix < 3 ? 
ITEMS_PER_SLOT : BW_ITEMS_PER_SLOT; + const uint32_t res = 0; + bbi_write(fp, items_per_slot); + bbi_write(fp, res); + // Write R tree levels + for(int i = rtree->tree.depth - 1; i >= 0; i--) write_r_tree_level(fp, rtree, i); +} + +void calc_start_ix(tree_t * const tree, const int level, const bool rtree, const r_tree_block_t * const rbp) { + // For higher level nodes we balance node sizes across the tree, but for the level 1 nodes + // (one above the leaves) they all have to have the same number of entries (block_size) apart + // from the last one which normally has less. This is to allow the reader to quickly go from + // chromosome ID to the key + int w = tree->width[level]; + int *start_ix = NULL; + r_node_t *rn = NULL, *rn1 = NULL; + if(rtree) { + rn = tree->start[level]; + if(level > 0) rn1 = tree->start[level - 1]; + } + else start_ix = tree->start[level]; + int w1 = level > 0 ? tree->width[level - 1] : tree->n; + if(rtree || level > 0) { + int k = w1 / w; + int o1 = w * k - w1; + int o2 = w * (k + 1) - w1; + int d = 0; + int off = 0; + int sz; + for(int i = 0; i < w; i++) { + if(abs(d + o1) < abs(d + o2)) { + d += o1; + sz = k; + } else { + d += o2; + sz = k + 1; + } + if(rtree) { + rn[i].start_idx = off; + if(rn1) { + rn[i].start_ctg = rn1[off].start_ctg; + rn[i].start_base = rn1[off].start_base; + rn[i].end_ctg = rn1[off + sz - 1].end_ctg; + rn[i].end_base = rn1[off + sz -1].end_base; + } else { + rn[i].start_ctg = rbp[off].ctg; + rn[i].start_base = rbp[off].block->start; + rn[i].end_ctg = rbp[off + sz - 1].ctg; + rn[i].end_base = rbp[off + sz - 1].block->end; + } + } else start_ix[i] = off; + off += sz; + } + assert(off == w1); + } else for(int i = 0; i < w; i++) start_ix[i] = i * tree->block_size; + if(rtree) rn[w].start_idx = w1; + else start_ix[w] = w1; +} + +void set_tree_widths(tree_t * const tree, uint32_t const block_size, size_t item_size, const bool rtree, const r_tree_block_t * const rbp) { + uint64_t n1 = tree->n; + while(true) { + 
hts_resize(int, tree->depth + 1, &tree->wsize, &tree->width, 0); + n1 = (n1 + BLOCK_SIZE - 1) / BLOCK_SIZE; + tree->width[tree->depth++] = n1; + if(n1 <= 1) break; + } + tree->block_size = tree->depth > 1 ? block_size : tree->n; + tree->start = calloc(tree->depth, sizeof(void *)); + for(int i = 0; i < tree->depth; i++) { + tree->start[i] = malloc(item_size * (tree->width[i] + 1)); + calc_start_ix(tree, i, rtree, rbp); + } +} + +contig_tree_t *init_contig_tree(args_t * const args) { + contig_tree_t * const ctree = calloc(1, sizeof(contig_tree_t)); + // Pick up contig names from the selected regions + // These are already in alphabetical order and filtered so that + // only contigs contained in the BCF/VCF header are present + bcf_sr_regions_t * const reg = args->sr->regions; + const int n = ctree->tree.n = ctree->tree.n_items = reg->nseqs; + assert(n > 0); + ctree->len = malloc(n * sizeof(uint64_t)); + ctree->names = reg->seq_names; + int max = 0; + for(int i = 0; i < n; i++) { + const int j = bcf_hdr_name2id(args->hdr, ctree->names[i]); + assert(j >= 0); + const bcf_idpair_t * const idp = args->hdr->id[BCF_DT_CTG] + j; + ctree->len[i] = idp->val->info[0]; + int s = (int)strlen(ctree->names[i]); + if(s > max) max = s; + } + ctree->key_len = max; + tree_t * const tree = &ctree->tree; + set_tree_widths(tree, BLOCK_SIZE, sizeof(int), false, NULL); + return ctree; +} + +r_tree_t *init_r_tree(args_t * const args, const int ix, const uint32_t n_items, uint64_t data_end){ + r_tree_t * const rtree = calloc(1, sizeof(r_tree_t)); + bcf_sr_regions_t * const reg = args->sr->regions; + rtree->nctgs = reg->nseqs; + rtree->end_offset = data_end; + tree_t * const tree = &rtree->tree; + uint64_t nb = 0; + for(int i = 0; i < rtree->nctgs; i++) nb += args->ctg_data[i].bbi_data[ix].block_idx; + tree->n = nb; + tree->n_items = n_items; + rtree->blocks = malloc(sizeof(r_tree_block_t) * nb); + nb = 0; + r_tree_block_t *rbp = rtree->blocks; + for(int i = 0; i < rtree->nctgs; i++) { + 
bbi_data_t * const bbd = args->ctg_data[i].bbi_data + ix; + for(int j = 0; j < bbd->block_idx; j++, rbp++) { + rbp->block = bbd->blocks + j; + rbp->ctg = i; + } + } + set_tree_widths(tree, BLOCK_SIZE, sizeof(r_node_t), true, rtree->blocks); + return rtree; +} + +void destroy_tree(tree_t * const tree) { + if(tree->width) free(tree->width); + if(tree->start) { + for(int i = 0; i < tree->depth; i++) free(tree->start[i]); + free(tree->start); + } +} + +void destroy_contig_tree(contig_tree_t * const ctree) { + destroy_tree(&ctree->tree); + free(ctree); +} + +void destroy_r_tree_and_data(args_t const * args, const int ix, r_tree_t * const rtree) { + destroy_tree(&rtree->tree); + if(rtree->blocks) free(rtree->blocks); + for(int i = 0; i < rtree->nctgs; i++) { + bbi_data_t * const bbd = args->ctg_data[i].bbi_data + ix; + if(bbd->blocks) free(bbd->blocks); + bbd->block_idx = bbd->block_sz = bbd->n_items = 0; + bbd->blocks = NULL; + memset(&bbd->bbuf, 0, sizeof(bbd->bbuf)); + } + free(rtree); + +} + +void _init_bbi_header(bbi_header_t * const header, const bool bigbed) { + memset(header, 0, sizeof(bbi_header_t)); + header->magic = bigbed ? 0x8789F2EB : 0x888FFC26; + header->version = 4; + header->zoomLevels = ZOOM_LEVELS; + header->totalSummaryOffset = BBI_HEADER_SIZE + header->zoomLevels * ZOOM_HEADER_SIZE; + if(bigbed) { + header->fieldCount = 14; + header->definedFieldCount = 0; + header->autoSqlOffset = header->totalSummaryOffset; + header->totalSummaryOffset += strlen(autosql_desc) + 1; + } + header->extensionOffset = header->totalSummaryOffset + TOTAL_SUMMARY_SIZE; + header->chromosomeTreeOffset = header->extensionOffset + EXT_HEADER_SIZE; +} + +void init_bbi_header(args_t * const args, const bool bigbed) { + const int ix = bigbed ? 
0 : 1; + args->bbi_hdr[ix] = malloc(sizeof(bbi_header_t)); + _init_bbi_header(args->bbi_hdr[ix], bigbed); + contig_tree_t *ctree = init_contig_tree(args); + args->ctg_data = calloc(ctree->tree.n, sizeof(bbi_ctg_data_t)); + bbi_ctg_data_t * bdata = args->ctg_data; + for(int i = 0; i < ctree->tree.n; i++, bdata++) { + bdata->zoom_data.base_type = calloc(1, (ctree->len[i] + 1) >> 1); + bdata->zoom_data.len = ctree->len[i]; + } + if(bigbed) { + for(int i = 0; i < 3; i++) write_contig_tree(args->bigbedfiles[i], args->bbi_hdr[0], ctree); + args->bbi_hdr[0]->fullDataOffset = ftell(args->bigbedfiles[0]); + for(int i = 0; i < 3; i++) fseek(args->bigbedfiles[i], args->bbi_hdr[0]->fullDataOffset + 4, SEEK_SET); + } else { + const int j = args->strand_specific ? 2 : 1; + for(int i = 0; i < j; i++) write_contig_tree(args->bigwigfiles[i], args->bbi_hdr[1], ctree); + args->bbi_hdr[1]->fullDataOffset = ftell(args->bigwigfiles[0]); + for(int i = 0; i < j; i++) fseek(args->bigwigfiles[i], args->bbi_hdr[1]->fullDataOffset + 4, SEEK_SET); + } + destroy_contig_tree(ctree); +} + +void *bbi_write_thread(void *p) { + uint32_t idx = 0; + args_t * const args = p; + cblock_buffer_t * const cbuf = &args->cblock_buf; + const int nb = cbuf->n_cblocks; + for(;;) { + pthread_mutex_lock(&cbuf->mut); + for(;;) { + if((cbuf->cblocks[idx].state == cblock_compressed) || (cbuf->end_of_input && cbuf->cblocks[idx].state == cblock_empty)) break; + pthread_cond_wait(&cbuf->cond[2], &cbuf->mut); + } + bbi_cblock_t * const cb = cbuf->cblocks + idx; + if(cb->state != cblock_compressed) { + pthread_mutex_unlock(&cbuf->mut); + break; + } + const int ix = cb->ix; + FILE * const fp = ix < 3 ? 
args->bigbedfiles[ix] : args->bigwigfiles[ix - 3]; + args->ctg_data[cb->ctg_id].bbi_data[ix].blocks[cb->block_idx].offset = ftell(fp); + pthread_mutex_unlock(&cbuf->mut); + kstring_t * buf = cb->buf_p; + fwrite(ks_str(buf), 1, ks_len(buf), fp); + cb->state = cblock_empty; + pthread_cond_signal(&cbuf->cond[0]); + idx = (idx + 1) % nb; + } + pthread_cond_signal(&cbuf->cond[0]); + return NULL; +} + +void *bbi_compress_thread(void *p) { + uint32_t idx = 0; + args_t * const args = p; + void *comp_buf = NULL; + size_t comp_buf_size = 0; + cblock_buffer_t * const cbuf = &args->cblock_buf; + const int nb = cbuf->n_cblocks; + for(;;) { + pthread_mutex_lock(&cbuf->mut); + for(;;) { + for(int i = 0; i < nb; i++, idx = (idx + 1) % nb) if(cbuf->cblocks[idx].state == cblock_uncompressed) break; + if(cbuf->cblocks[idx].state == cblock_uncompressed || cbuf->end_of_input) break; + pthread_cond_wait(&cbuf->cond[1], &cbuf->mut); + } + bbi_cblock_t * const cb = cbuf->cblocks + idx; + if(cb->state != cblock_uncompressed) { + pthread_mutex_unlock(&cbuf->mut); + break; + } + cb->state = cblock_active; + pthread_mutex_unlock(&cbuf->mut); + kstring_t * buf = cb->buf_p; + const int ix = cb->ix; + uLong req_size = compressBound((uLong) ks_len(buf)); + if(req_size > comp_buf_size) { + comp_buf_size = req_size * 1.2; + comp_buf = realloc(comp_buf, comp_buf_size); + } + if(ks_len(buf) > args->bb_global[ix].max_buf_size) args->bb_global[ix].max_buf_size = ks_len(buf); + uLongf compress_size = comp_buf_size; + int ret = compress((Bytef *)comp_buf, &compress_size, (Bytef *)ks_str(buf), (uLong)ks_len(buf)); + if(ret != 0) error("Failed to compress data block\n"); + ks_resize(buf, (size_t)compress_size); + memcpy(buf->s, comp_buf, (size_t)compress_size); + buf->l = (size_t)compress_size; + cb->state = cblock_compressed; + pthread_cond_signal(&cbuf->cond[2]); + idx = (idx + 1) % nb; + } + if(comp_buf) free(comp_buf); + pthread_cond_signal(&cbuf->cond[2]); + return NULL; +} + +void init_cblocks(args_t 
* const args, const int nb) { + args->cblock_buf.n_cblocks = nb; + args->cblock_buf.cblocks = calloc(nb, sizeof(bbi_cblock_t)); + for(int i = 0; i < args->cblock_buf.n_cblocks; i++) { + args->cblock_buf.cblocks[i].buf_p = malloc(sizeof(kstring_t)); + ks_initialize(args->cblock_buf.cblocks[i].buf_p); + } +} + +void clear_cblocks(args_t * const args) { + for(int i = 0; i < args->cblock_buf.n_cblocks; i++) { + args->cblock_buf.cblocks[i].state = cblock_empty; + ks_clear(args->cblock_buf.cblocks[i].buf_p); + } + args->cblock_buf.end_of_input = false; + args->cblock_buf.pos = 0; +} + +void destroy_cblocks(args_t * const args) { + if(args->cblock_buf.n_cblocks) { + for(int i = 0; i < args->cblock_buf.n_cblocks; i++) ks_free(args->cblock_buf.cblocks[i].buf_p); + free(args->cblock_buf.cblocks); + args->cblock_buf.cblocks = NULL; + args->cblock_buf.n_cblocks = 0; + } +} + +void finish_bb_block(args_t * const args, const int ctg_id, const int ix) { + bbi_global_data_t * const gdata = args->bb_global + ix; + kstring_t *buf = gdata->buffer; + if(ks_len(buf) > 0) { + // Get cblock to store block for compression + cblock_buffer_t * const cbuf = &args->cblock_buf; + const int pos = cbuf->pos; + pthread_mutex_lock(&cbuf->mut); + while(cbuf->cblocks[pos].state != cblock_empty) { + pthread_cond_wait(&cbuf->cond[0], &cbuf->mut); + } + bbi_data_t * const bdata = args->ctg_data[ctg_id].bbi_data + ix; + hts_resize(bbi_block_t, bdata->block_idx + 1, &bdata->block_sz, &bdata->blocks, 0); + bbi_block_t * const bl = bdata->blocks + (bdata->block_idx++); + memcpy(bl, &bdata->bbuf, sizeof(bdata->bbuf)); + bbi_cblock_t * const bp = cbuf->cblocks + pos; + bp->ix = ix; + gdata->buffer = bp->buf_p; + bp->buf_p = buf; + bp->ctg_id = ctg_id; + bp->block_idx = bdata->block_idx - 1; + bp->state = cblock_uncompressed; + pthread_mutex_unlock(&cbuf->mut); + pthread_cond_signal(&cbuf->cond[1]); + cbuf->pos = (pos + 1) % cbuf->n_cblocks; + if(gdata->first_time) { + gdata->first_ctg = ctg_id; + 
gdata->first_base = bdata->bbuf.start; + gdata->first_time = false; + } + gdata->last_ctg = ctg_id; + gdata->last_base = bdata->bbuf.end; + ks_clear(gdata->buffer); + memset(&bdata->bbuf, 0, sizeof(bdata->bbuf)); + bdata->n_items = 0; + } +} + +void finish_bw_block(args_t * const args, const int ctg_id, const int ix) { + bbi_data_t * const bdata = args->ctg_data[ctg_id].bbi_data + 3 + ix; + if(bdata->n_items > 0) { + // Write bigWig section to buffer + bbi_global_data_t * const gdata = args->bb_global + ix + 3; + gdata->n_rec++; + bw_rec_t * bw_rec = bdata->bw_rec; + kstring_t *buf = gdata->buffer; + uint32_t dat[5] = {ctg_id, bw_rec[0].start, bw_rec[bdata->n_items - 1].start + 1, 0, 1}; + uint8_t dat1[2] = {2, 0}; + uint16_t itemCount = bdata->n_items; + kputsn_((char *)dat, sizeof(uint32_t) * 5, ks_clear(buf)); + kputsn_((char *)dat1, 2, buf); + kputsn_((char *)&itemCount, 2, buf); + for(int i = 0; i < bdata->n_items; i++, bw_rec++) { + kputsn_((char *)&bw_rec->start, 4, buf); + kputsn_((char *)&bw_rec->val, 4, buf); + } + // Get cblock to store block for compression + cblock_buffer_t * const cbuf = &args->cblock_buf; + const int pos = cbuf->pos; + pthread_mutex_lock(&cbuf->mut); + while(cbuf->cblocks[pos].state != cblock_empty) { + pthread_cond_wait(&cbuf->cond[0], &cbuf->mut); + } + bbi_data_t * const bdata = args->ctg_data[ctg_id].bbi_data + 3 + ix; + hts_resize(bbi_block_t, bdata->block_idx + 1, &bdata->block_sz, &bdata->blocks, 0); + bbi_block_t * const bl = bdata->blocks + (bdata->block_idx++); + bl->start = dat[1]; + bl->end = dat[2]; + bbi_cblock_t * const bp = cbuf->cblocks + pos; + bp->ix = ix + 3; + gdata->buffer = bp->buf_p; + bp->buf_p = buf; + bp->ctg_id = ctg_id; + bp->block_idx = bdata->block_idx - 1; + bp->state = cblock_uncompressed; + pthread_mutex_unlock(&cbuf->mut); + pthread_cond_signal(&cbuf->cond[1]); + cbuf->pos = (pos + 1) % cbuf->n_cblocks; + if(gdata->first_time) { + gdata->first_ctg = ctg_id; + gdata->first_base = dat[1]; + 
gdata->first_time = false; + } + gdata->last_ctg = ctg_id; + gdata->last_base = dat[2]; + ks_clear(gdata->buffer); + memset(&bdata->bbuf, 0, sizeof(bdata->bbuf)); + bdata->n_items = 0; + } +} + +void finish_bbi_blocks(args_t * const args, const int ctg_id) { + for(int i = 0; i < 3; i++) finish_bb_block(args, ctg_id, i); + for(int i = 0; i < 2; i++) finish_bw_block(args, ctg_id, i); +} + +void finish_bb_data_file(args_t * const args, const int ix) { + FILE *fp = args->bigbedfiles[ix]; + args->bb_global[ix].index_offset = ftell(fp); +} + +void finish_bw_data_file(args_t * const args, const int ix) { + FILE *fp = args->bigwigfiles[ix]; + args->bb_global[ix + 3].index_offset = ftell(fp); +} + +// Create and write main index +void *bbi_create_index(void *p) { + bbi_thr_info_t * const bi = p; + args_t * const args = bi->args; + const int ix = bi->ix; + // Create and write main index + uint64_t pos = ftell(ix < 3 ? args->bigbedfiles[ix] : args->bigwigfiles[ix - 3]); + r_tree_t * const rtree = init_r_tree(args, ix, bi->nrec, pos); + write_r_tree(args, ix, rtree); + destroy_r_tree_and_data(args, ix, rtree); + return NULL; +} diff --git a/tools/utils/bbi.h b/tools/utils/bbi.h new file mode 100644 index 00000000..122b52d3 --- /dev/null +++ b/tools/utils/bbi.h @@ -0,0 +1,77 @@ +/* + * bbi.h + * + * Created on: Jan 7, 2020 + * Author: heath + */ + + + +#ifndef BBI_H_ +#define BBI_H_ + +#include "bbi_structs.h" + +#define bbi_write(stream,src) fwrite(&src, sizeof(src), 1, stream) + +typedef struct { + uint64_t n; // Number of leaf level nodes + uint64_t n_items; // Number of items + uint32_t block_size; + int depth; // Depth of tree (including leaves) + int wsize; // Size of width array + int *width; // Number of nodes at each level + void **start; // Starting position for each node (size is width + 1 with last element giving total number of nodes) +} tree_t; + +// Information about the contig B+ tree +typedef struct { + char **names; + uint64_t *len; + uint32_t key_len; + 
tree_t tree; +} contig_tree_t; + +typedef struct { + uint32_t start_ctg; + uint32_t end_ctg; + uint32_t start_base; + uint32_t end_base; + int start_idx; // Starting index for node (or data block) at lower level +} r_node_t; + +// Information about the R trees (main index + zoom indices) +typedef struct { + uint32_t ctg; + bbi_block_t * block; +} r_tree_block_t; + +typedef struct { + int nctgs; + uint64_t end_offset; + r_tree_block_t *blocks; + tree_t tree; +} r_tree_t; + +typedef struct { + args_t * args; + int ix; + uint32_t nrec; +} bbi_thr_info_t; + +void init_bbi_header(args_t * const args, const bool bigbed); +void write_bbi_header(FILE * const fp, bbi_header_t * const header, bbi_global_data_t * const bdata); +void finish_bb_block(args_t * const args, const int ctg_id, const int ix); +void finish_bw_block(args_t * const args, const int ctg_id, const int ix); +void finish_bbi_blocks(args_t * const args, const int ctg_id); +void finish_bb_data_file(args_t * const args, const int ix); +void finish_bw_data_file(args_t * const args, const int ix); +void *bbi_write_thread(void *p); +void *bbi_compress_thread(void *p); +void *handle_bedmethyl_thread(void *p); +void *bbi_create_index(void *p); +void init_cblocks(args_t * const args, const int nb); +void clear_cblocks(args_t * const args); +void destroy_cblocks(args_t * const args); + +#endif /* BBI_H_ */ diff --git a/tools/utils/bbi_defs.h b/tools/utils/bbi_defs.h new file mode 100644 index 00000000..eae4d183 --- /dev/null +++ b/tools/utils/bbi_defs.h @@ -0,0 +1,26 @@ +/* + * bbi_defs.h + * + * Created on: Jan 15, 2020 + * Author: heath + */ + +#ifndef BBI_DEFS_H_ +#define BBI_DEFS_H_ + +#define BLOCK_SIZE 256 +#define ITEMS_PER_SLOT 512 +#define BW_ITEMS_PER_SLOT 1024 + +#define INITIAL_REDUCTION 10 +#define BW_INITIAL_REDUCTION 40 +#define ZOOM_RES_INCREMENT 4 + +#define BBI_HEADER_SIZE 64 +#define EXT_HEADER_SIZE 64 +#define ZOOM_HEADER_SIZE 24 +#define TOTAL_SUMMARY_SIZE 40 + +#define ZOOM_LEVELS 10 + +#endif /* 
BBI_DEFS_H_ */ diff --git a/tools/utils/bbi_structs.h b/tools/utils/bbi_structs.h new file mode 100644 index 00000000..979b553d --- /dev/null +++ b/tools/utils/bbi_structs.h @@ -0,0 +1,142 @@ +/* + * bbi_structs.h + * + * Created on: Jan 10, 2020 + * Author: heath + */ + +#ifndef BBI_STRUCTS_H_ +#define BBI_STRUCTS_H_ + +#include "bbi_defs.h" + +// Main header of bbi file +typedef struct { + uint32_t magic; + uint16_t version; + uint16_t zoomLevels; + uint64_t chromosomeTreeOffset; + uint64_t fullDataOffset; + uint64_t fullIndexOffset; + uint16_t fieldCount; + uint16_t definedFieldCount; + uint64_t autoSqlOffset; + uint64_t totalSummaryOffset; + uint32_t uncompressBufSize; + uint64_t extensionOffset; +} bbi_header_t; + +typedef struct { + uint32_t start; + uint32_t end; + uint64_t offset; +} bbi_block_t; + +typedef enum { cblock_empty = 0, cblock_uncompressed, cblock_active, cblock_compressed } bbi_cblock_state_t; + +typedef struct { + int ctg_id; + int block_idx; + kstring_t *buf_p; + int ix; + bbi_cblock_state_t state; +} bbi_cblock_t; + +typedef struct { + uint32_t end_base; + uint32_t count; +} bb_zrec_t; + +typedef struct { + uint32_t end_base; + uint32_t count; + float x; + float xsq; + float min; + float max; +} bw_zrec_t; + +typedef struct { + union { + bb_zrec_t *bb_rec; + bw_zrec_t *bw_rec; + }; + int size; + int ix; +} bbi_zblock_t; + +typedef struct { + bw_zrec_t *bw_rec; + int size; + int ix; +} bw_zblock_t; + +typedef struct { + uint32_t start; + float val; +} bw_rec_t; + +typedef struct { + bbi_block_t bbuf; + bbi_block_t *blocks; + bbi_zblock_t zblock[ZOOM_LEVELS - 1]; + bw_rec_t bw_rec[BW_ITEMS_PER_SLOT]; + int n_items; + int block_idx; + int block_sz; +} bbi_data_t; + +// Stored data to allow generation of zoom levels for bigBed and bigWig files +// We store data on two bases in each byte of base_type; +// bits 4-7: base 1, bits 0-3: base 2 +// +// bits 0-1: bedmethyl_type +// bit 2: strand (0 == top, 1 == bottom) +// bit 3: non-zero methylation 
(1 == yes) +// bits 4-5: bedmethyl_type +// bit 6: strand (0 == top, 1 == bottom) +// bit 7: non-zero methylation (1 == yes) +// +// There is one zoom_data_t structure per contig +// + +typedef struct { + uint8_t *base_type; + float *val; + int val_size; + int val_ix; + uint64_t len; // Chromosome length +} zoom_dt_t; + +typedef struct { + zoom_dt_t zoom_data; + bbi_data_t bbi_data[5]; +} bbi_ctg_data_t; + +typedef struct { + kstring_t *buffer; + void *comp_buf; + uint32_t zoom_scales[ZOOM_LEVELS]; + size_t comp_buf_size; + uint32_t n_rec; + uint64_t index_offset; + uint64_t max_buf_size; + uint64_t zoom_data_offset[ZOOM_LEVELS]; + uint64_t zoom_index_offset[ZOOM_LEVELS]; + uint32_t first_base; + uint32_t last_base; + uint32_t first_ctg; + uint32_t last_ctg; + uint32_t n_cts; + uint32_t res_size[ZOOM_LEVELS]; + uint32_t res_end[ZOOM_LEVELS]; + uint64_t total_bases; + double min_x; + double max_x; + double sum_x; + double sum_xsq; + bool first_time; +} bbi_global_data_t; + + +#endif /* BBI_STRUCTS_H_ */ diff --git a/tools/utils/calc_gt_prob.c b/tools/utils/calc_gt_prob.c new file mode 100644 index 00000000..e4520dd2 --- /dev/null +++ b/tools/utils/calc_gt_prob.c @@ -0,0 +1,327 @@ +#include +#include +#include +#include + +#include "mextr.h" + +typedef struct { + double e, k, ln_k[3]; +} qual_prob; + +static qual_prob q_prob[MAX_QUAL + 1]; + +void fill_base_prob_table(void) { + for (int q = 0; q <= MAX_QUAL; q++) { + double e = exp(-.1 * (double)q * LOG10); + if(e > .5) e = .5; + double k = e / (3.0 - 4.0 * e); + q_prob[q].e = e; + q_prob[q].k = k; + q_prob[q].ln_k[0] = log(k); + q_prob[q].ln_k[1] = log(0.5 + k); + q_prob[q].ln_k[2] = log(1.0 + k); + } +} + +static inline void get_Z(double x1, double x2, double k1, double k2, double l, double t, double *Z) { + double lpt = l + t; + double lmt = l - t; + double d = (x1 + x2) * lmt; + // w = 1, p = 1 + double sinm = (x1 * (lpt + 2.0 * k2) - x2 * (2.0 - lpt + 2.0 * k1)) / d; + if(sinm < -1.0) sinm = -1.0; + else if(sinm 
> 1.0) sinm = 1.0; + Z[0] = 0.5 * (lmt * sinm + 2.0 - lpt); + // w = 1, p = 1/2 + sinm = (x1 * (2.0 + lpt + 4.0 * k2) - x2 * (2.0 - lpt + 4.0 * k1)) / d; + if(sinm < -1.0) sinm = -1.0; + else if(sinm > 1.0) sinm = 1.0; + Z[1] = 0.5 * (lmt * sinm + 2.0 - lpt); + // w = 1/2, p = 1 + sinm = (x1 * (lpt + 4.0 * k2) - x2 * (2.0 - lpt + 4.0 * k1)) / d; + if(sinm < -1.0) sinm = -1.0; + else if(sinm > 1.0) sinm = 1.0; + Z[2] = 0.5 * (lmt * sinm + 2.0 - lpt); +} + +static void add_bias(double *ll, char rf, double ref_bias) { + double lrb = log(ref_bias); + double lrb1 = log(0.5 * (1.0 + ref_bias)); + memset(ll, 0, sizeof(double) * 10); + switch (rf) { + case 'A': + ll[0] = lrb; + ll[1] = ll[2] = ll[3] = lrb1; + break; + case 'C': + ll[4] = lrb; + ll[1] = ll[5] = ll[6] = lrb1; + break; + case 'G': + ll[7] = lrb; + ll[2] = ll[5] = ll[8] = lrb1; + break; + case 'T': + ll[9] = lrb; + ll[3] = ll[6] = ll[8] = lrb1; + break; + } +} + +// This function is taken from genotype_model.c in bs_call +// As far as possible the two functions should be kept in sync +// (Yes, a shared library would make more sense...to do) +void calc_gt_prob(gt_meth *gt, args_t *const args, char rf) { + qual_prob qp[8]; + for(int i = 0; i < 8; i++) qp[i] = q_prob[gt->aqual[i]]; + double l = 1.0 - args->under_conv; + double t = args->over_conv; + double n[8]; + for (int i = 0; i < 8; i++) n[i] = (double)gt->counts[i]; + double ll[10]; + double ref_bias = args->ref_bias; + // Add in prior from reference + add_bias(ll, rf, ref_bias); + if (n[0]) { + ll[0] += n[0] * qp[0].ln_k[2]; // AA + double tz = n[0] * qp[0].ln_k[1]; + ll[1] += tz; // AC + ll[2] += tz; // AG + ll[3] += tz; // AT + tz = n[0] * qp[0].ln_k[0]; + ll[4] += tz; // CC + ll[5] += tz; // CG + ll[6] += tz; // CT + ll[7] += tz; // GG + ll[8] += tz; // GT + ll[9] += tz; // TT + } + if (n[1]) { + ll[4] += n[1] * qp[1].ln_k[2]; // CC + double tz = n[1] * qp[1].ln_k[1]; + ll[1] += tz; // AC + ll[5] += tz; // CG + ll[6] += tz; // CT + tz = n[1] * 
qp[1].ln_k[0]; + ll[0] += tz; // AA + ll[2] += tz; // AG + ll[3] += tz; // AT + ll[7] += tz; // GG + ll[8] += tz; // GT + ll[9] += tz; // TT + } + if (n[2]) { + ll[7] += n[2] * qp[2].ln_k[2]; // GG + double tz = n[2] * qp[2].ln_k[1]; + ll[2] += tz; // AG + ll[5] += tz; // CG + ll[8] += tz; // TG + tz = n[2] * qp[2].ln_k[0]; + ll[0] += tz; // AA + ll[1] += tz; // AC + ll[3] += tz; // AT + ll[4] += tz; // CC + ll[6] += tz; // CT + ll[9] += tz; // TT + } + if (n[3]) { + ll[9] += n[3] * qp[3].ln_k[2]; // TT + double tz = n[3] * qp[3].ln_k[1]; + ll[3] += tz; // AT + ll[6] += tz; // CT + ll[8] += tz; // GT + tz = n[3] * qp[3].ln_k[0]; + ll[0] += tz; // AA + ll[1] += tz; // AC + ll[2] += tz; // AG + ll[4] += tz; // CC + ll[5] += tz; // CG + ll[7] += tz; // GG + } + double Z[6] = {-1.0, -1.0, -1.0, -1.0, -1.0, -1.0}; + if (n[5] + n[7] > 0.0) { + get_Z(n[5], n[7], qp[5].k, qp[7].k, l, t, Z); + for(int k = 0; k < 3; k++) gt->cmeth[k] = (Z[k] - 1.0 + l) / (l - t); + } + if (n[4] + n[6] > 0.0) { + get_Z(n[6], n[4], qp[6].k, qp[4].k, l, t, Z+3); + for(int k = 0; k < 3; k++) gt->gmeth[k] = (Z[k + 3] - 1.0 + l) / (l - t); + } + if (n[4]) { + ll[0] += n[4] * qp[4].ln_k[2]; // AA + ll[2] += log(1.0 - 0.5 * Z[4] + qp[4].k) * n[4]; // AG + ll[7] += log(1.0 - Z[3] + qp[4].k) * n[4]; // GG + double tz = log(0.5 * (1.0 - Z[5]) + qp[4].k) * n[4]; + ll[5] += tz; // CG + ll[8] += tz; // GT + tz = n[4] * qp[4].ln_k[1]; + ll[1] += tz; // AC + ll[3] += tz; // AT + tz = n[4] * qp[4].ln_k[0]; + ll[4] += tz; // CC + ll[6] += tz; // CT + ll[9] += tz; // TT + } + if (n[5]) { + ll[4] += log(Z[0] + qp[5].k) * n[5]; // CC + double tz = log(0.5 * Z[2] + qp[5].k) * n[5]; + ll[1] += tz; // AC + ll[5] += tz; // CG + ll[6] += log(0.5 * Z[1] + qp[5].k) * n[5]; // CT + tz = n[5] * qp[5].ln_k[0]; + ll[0] += tz; // AA + ll[2] += tz; // AG + ll[3] += tz; // AT + ll[7] += tz; // GG + ll[8] += tz; // GT + ll[9] += tz; // TT + } + if (n[6]) { + ll[7] += log(Z[3] + qp[6].k) * n[6]; // GG + double tz = log(0.5 * 
Z[5] + qp[6].k) * n[6]; + ll[5] += tz; // CG + ll[8] += tz; // TG + ll[2] += log(0.5 * Z[4] + qp[6].k) * n[6]; // AG + tz = n[6] * qp[6].ln_k[0]; + ll[0] += tz; // AA + ll[1] += tz; // AC + ll[3] += tz; // AT + ll[4] += tz; // CC + ll[6] += tz; // CT + ll[9] += tz; // TT + } + if (n[7]) { + ll[9] += n[7] * qp[7].ln_k[2]; // TT + ll[4] += log(1.0 - Z[0] + qp[7].k) * n[7]; // CC + ll[6] += log(1.0 - 0.5 * Z[1] + qp[7].k) * n[7]; // CT + double tz = log(0.5 * (1.0 - Z[2]) + qp[7].k) * n[7]; + ll[1] += tz; // AC + ll[5] += tz; // CG + tz = n[7] * qp[7].ln_k[1]; + ll[3] += tz; // AT + ll[8] += tz; // GT + tz = n[7] * qp[7].ln_k[0]; + ll[0] += tz; // AA + ll[2] += tz; // AG + ll[7] += tz; // GG + } + double max = ll[0]; + int mx = 0; + for (int i = 1; i < 10; i++) { + if (ll[i] > max) { + max = ll[i]; + mx = i; + } + } + gt->max_gt = mx; + double sum = 0.0; + for (int i = 0; i < 10; i++) { + sum += exp(ll[i] - max); + } + sum = log(sum); + gt->sum = sum + max; + for (int i = 0; i < 10; i++) { + gt->gt_prob[i] = (ll[i] - max - sum); + } +} + +static int gt_idx[10][2] = { + {-1, -1}, // AA + {2, -1}, // AC + {-1, 1}, // AG + {-1, -1}, // AT + {0, -1}, // CC + {2, 2}, // CG + {1, -1}, // CT + {-1, 0}, // GG + {-1, 2}, // GT + {-1, -1} // TT +}; + +double get_meth(gt_meth *g, int idx) { + double m = -1.0; + int i = gt_idx[g->max_gt][idx]; + if(i >= 0) m = idx ? g->gmeth[i] : g->cmeth[i]; + return m; +} + +// Calculate combined methylation for a CpG using information from both strands +// if available, taking account of the called genotypes. 
If information is not +// available from both strands, use the single site estimate of methylation +void calc_cpg_meth(args_t *const args, int ns, cpg_prob *cpg, gt_meth *g1, gt_meth *g2) { + double wval[3] = {1.0, 1.0, 0.5}; + double pval[3] = {1.0, 0.5, 1.0}; + for(int ix = 0; ix < ns; ix++) { + if(g1[ix].skip || g2[ix].skip) continue; + int gt1 = g1[ix].max_gt; + int gt2 = g2[ix].max_gt; + cpg[ix].max_gt[0] = gt1; + cpg[ix].max_gt[1] = gt2; + cpg[ix].prob_best = g1[ix].gt_prob[gt1] + g2[ix].gt_prob[gt2]; + cpg[ix].prob_cg = g1[ix].gt_prob[4] + g2[ix].gt_prob[7]; + // Calc meth + double n1[8], n2[8]; + qual_prob qp1[8], qp2[8]; + for (int i = 0; i < 8; i++) { + n1[i] = (double)g1[ix].counts[i]; + n2[i] = (double)g2[ix].counts[i]; + qp1[i] = q_prob[g1[ix].aqual[i]]; + qp2[i] = q_prob[g2[ix].aqual[i]]; + } + double l = 1.0 - args->under_conv; + double t = args->over_conv; + double g = (l - t) * 0.5; + double f = (2.0 - l - t) * 0.5; + double kc = qp1[5].k; + double kt = qp1[7].k; + double kg = qp2[6].k; + double ka = qp2[4].k; + int ix1 = gt_idx[gt1][0]; + int ix2 = gt_idx[gt2][1]; + if(ix1 >= 0) { + double w1 = wval[ix1]; + double p = pval[ix1]; + if(ix2 >= 0) { + double w2 = wval[ix2]; + double q = pval[ix2]; + // Get initial estimate + double m1 = (n1[5] + n2[6]) / (n1[5] + n2[6] + n1[7] * p + n2[4] * q); + double m = asin(2.0 * m1 - 1.0); + // Maximize using NR + for(int it = 0; it < 100; it++) { + double cosm = cos(m); + double sinm = sin(m); + double A = f + g * sinm; + double nm1 = g * p * w1 * cosm; + double d1 = p * w1 * A + kc; + double d2 = w1 * (1.0 - p * A) + kt; + double nm3 = g * q * w2 * cosm; + double d3 = q * w2 * A + kg; + double d4 = w2 * (1.0 - q * A) + ka; + double grad = nm1 * (n1[5] / d1 - n1[7] / d2) + nm3 * (n2[6] / d3 - n2[4] / d4); + if(fabs(grad) < 1.0e-8) { + m1 = 0.5 * (sinm + 1.0); + break; + } + double h = n1[5] * (nm1 * nm1 / d1 + g * p * w1 * sinm) / d1 + n1[7] * (nm1 * nm1 / d2 - g * p * w1 * sinm) / d2 + + n2[6] * (nm3 * nm3 / 
d3 + g * q * w2 * sinm) / d3 + n2[4] * (nm3 * nm3 / d4 - g * q * w2 * sinm) / d4; + m += grad / h; + } + cpg[ix].m = m1; + } else { + // Only the C+ has an estimate of methylation + double m1 = g1->cmeth[ix1]; + cpg[ix].m = m1; + } + } else if(ix2 >= 0) { + // Only the C- has an estimate of methylation + double m1 = g2->cmeth[ix2]; + cpg[ix].m = m1; + } else { + // No valud esetimates on either strand + cpg[ix].m = -1.0; + } + } +} + diff --git a/tools/utils/command_line.c b/tools/utils/command_line.c new file mode 100644 index 00000000..3e42bc1e --- /dev/null +++ b/tools/utils/command_line.c @@ -0,0 +1,375 @@ +/* + * command_line.c + * + * Created on: Dec 26, 2019 + * Author: heath + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "htslib/hfile.h" +#include "htslib/khash_str2int.h" + +#include "utils.h" +#include "mextr.h" + +// These are copied from htslib:synced_bcf_reader.c as the definitions are not visible +// in the standard library and we need them to allow sorting of regions + +typedef struct { + hts_pos_t start, end; +} region1_t; + +struct _region_t { + region1_t *regs; + int nregs, mregs, creg; +}; + +const char *usage(void) { + return + "\n" + "About: Extract CpG and nonCpG sites.\n" + "Usage: mextr [file] [regions]\n" + "Options:\n" + " -o, --cpgfile Output file for CpG sites (default = stdout)\n" + " -n, --noncpgfile Output file for nonCpG sites (default, not output)\n" + " -b, --bed-methyl Output file base for bedMethly files. 
Not compatible with multi-sample files (default, not output)\n" + " -t, --bed-track-line Track line for for bedMethly files (default, info taken from input VCF file)\n" + " -S, --report-file Output file for JSON report (default, not output)\n" + " -r, --regions restrict to comma separated list of regions" + " -R, --regions-file restrict to regions listed in file" + " -@, --threads Extra threads" + " -H, --no_header Do not print header line(s) in output file(s) (default, false)\n" + " -g, --common-gt Recall genotypes assuming common genotypes across samples\n" + " -m, --mode Output mode for CpG sites\n" + " combined Generate one line per CpG with combined estimates (default)\n" + " strand-specific Generate two lines per CpG with the strand specific estimates\n" + " -w, --bw-mode Output mode for bigWig files\n" + " combined Generate one bigWig file for both strands (default)\n" + " strand-specific Generate one bigWig files for each strand\n" + " -s, --select Select mode for sites/CpGs\n" + " hom Select on homozygote sites/CpGs (default)\n" + " het Select on heterozygote sites/CpGs\n" + " -B, --reference-bias Reference bias for re-calling (default 2)\n" + " -M, --min-nc Minimum number of non-converted bases for non CpG site (default 1)\n" + " -p, --prop Minimum proportion of sites/CpGs that must pass (default 0.0)\n" + " -N, --number Minimum number of sites/CpGs that must pass (default 1)\n" + " -I, --inform Minimum number of informative reads for a CpG/site to pass (default 1)\n" + " -T, --threshold Phred scaled threshold probability of selecting sites/CpGs (default 20)\n" + " -c, --conversion , set under and over conversion rates\n" + " -Q, --bq-threshold Base qality threshold used for calling\n" + " -z, --bgzip Compress output with bgzip\n" + " -D, --md5 Calculate md5 digest for all output files\n" + " -x, --tabix Generate tabix (tbx) indices for cpg and noncpg files\n" + "\n"; +} + +static struct option loptions[] = { + {"cpgfile",required_argument,0,'o'}, + 
{"noncpgfile",required_argument,0,'n'}, + {"bed-methyl",required_argument,0,'b'}, + {"bed-track-line",required_argument,0,'t'}, + {"report-file",required_argument,0,'S'}, + {"regions",required_argument,0,'r'}, + {"regions-file",required_argument,0,'R'}, + {"threads",required_argument,0,'@'}, + {"no_header",no_argument,0,'H'}, + {"common-gt",no_argument,0,'g'}, + {"mode",required_argument,0,'m'}, + {"bw-mode",required_argument,0,'w'}, + {"select",required_argument,0,'s'}, + {"prop",required_argument,0,'p'}, + {"min-nc",required_argument,0,'M'}, + {"reference-bias",required_argument,0,'B'}, + {"number",required_argument,0,'N'}, + {"inform",required_argument,0,'I'}, + {"threshold",required_argument,0,'T'}, + {"conversion",required_argument,0,'c'}, + {"bq-conversion",required_argument,0,'Q'}, + {"bgzip",no_argument,0,'z'}, + {"md5",no_argument,0,'D'}, + {"tabix",no_argument,0,'x'}, + {"help",no_argument,0,'h'}, + {0,0,0,0} +}; + +// Try to parse the paramaters used for bs_call from the headers +static void check_hdr_params(args_t *a) { + char *par[] = {"under_conversion", "over_conversion", "mapq_thresh", "bq_thresh", NULL}; + bcf_hdr_t *h = a->hdr; + for(int i = 0; i < h->nhrec; i++) { + bcf_hrec_t *hr = h->hrec[i]; + if(hr->type == BCF_HL_GEN) { + if(!strcmp(hr->key, "source") && !strncmp(hr->value, "bs_call", 7)) { + char *p = strchr(hr->value, ','); + while(p != NULL) { + p++; + int ix; + for(ix = 0; par[ix] != NULL; ix++) if(!strncmp(p, par[ix], strlen(par[ix]))) break; + if(par[ix] != NULL) { + char *p1 = strchr(p, '='); + if(p1) { + switch(ix) { + case 0: + a->under_conv = strtod(p1 + 1, &p); + break; + case 1: + a->over_conv = strtod(p1 + 1, &p); + break; + case 2: + a->mq_thresh = (int)strtol(p1 + 1, &p, 10); + break; + case 3: + a->bq_thresh = (int)strtol(p1 + 1, &p, 10); + break; + } + } + } + p = strchr(p, ','); + } + } + } + } +} + +static int cmp_reg(const void *s1, const void *s2, void *a) { + const int *i1 = s1; + const int *i2 = s2; + const 
bcf_sr_regions_t *reg = a; + return strcmp(reg->seq_names[*i1], reg->seq_names[*i2]); +} + +struct tctg { + char *name; + uint64_t len; +}; + +static int cmp_tctg(const void *s1, const void *s2) { + const struct tctg *ctg1, *ctg2; + ctg1 = s1; + ctg2 = s2; + return strcmp(ctg1->name, ctg2->name); +} + +void handle_command_line(int argc, char *argv[], args_t * const args) { + int c; + bool regions_file = false; + char *regions_list = NULL; + while ((c = getopt_long(argc, argv, "?QDxh:o:c:b:n:r:s:w:@:m:R:M:I:S:p:B:N:T:t:gzHah?",loptions,NULL)) >= 0) { + switch (c) { + case 'o': + args->cpgfilename = optarg; + break; + case 'n': + args->noncpgfilename = optarg; + args->output_noncpg = true; + break; + case 'D': + args->calc_md5 = true; + break; + case 'x': + args->tabix = true; + break; + case 'S': + args->reportfilename = optarg; + break; + case 'B': + args->ref_bias = atof(optarg); + break; + case 'R': + regions_file = true; + // fall through + case 'r': + regions_list = optarg; + break; + case 'H': + args->header = false; + break; + case 'g': + args->common_gt = true; + break; + case 'w': + if(!strcasecmp(optarg, "combined")) args->strand_specific = false; + else if(!strcasecmp(optarg, "strand-specific")) args->strand_specific = true; + break; + case 's': + if(!strcasecmp(optarg, "hom")) args->sel_mode = SELECT_HOM; + else if(!strcasecmp(optarg, "het")) args->sel_mode = SELECT_HET; + else error ("s (select) option can be either 'hom' or 'het'\n"); + break; + case 'm': + if(!strcasecmp(optarg, "combined")) args->mode = CPGMODE_COMBINED; + else if(!strcasecmp(optarg, "strand-specific")) args->mode = CPGMODE_SEPARATE; + else error ("m (mode) option can be either 'combined' or 'strand-specific'\n"); + break; + case 'c': + if (sscanf(optarg, "%lf,%lf", &args->under_conv, &args->over_conv) != 2) + error("c (conversion) option expects two comma separated arguments)\n"); + break; + case 'p': + args->min_prop = atof(optarg); + if(args->min_prop < 0.0) args->min_prop = 0.0; 
+ else if(args->min_prop > 1.0) args->min_prop = 1.0; + break; + case 'T': + args->sel_thresh = atoi(optarg); + if(args->sel_thresh < 0) args->sel_thresh = 0; + else if(args->sel_thresh > 255) args->sel_thresh = 255; + break; + case 'N': + args->min_num = atoi(optarg); + if(args->min_num < 1) args->min_num = 1; + break; + case 'b': + args->bedmethyl = optarg; + break; + case '@': + args->threads = atoi(optarg); + if(args->threads < 0) args->threads = 0; + break; + case 't': + args->bedmethyl_track_line = optarg; + break; + case 'I': + args->min_inform = atoi(optarg); + if(args->min_inform < 0) args->min_inform = 0; + break; + case 'M': + args->min_nc = atoi(optarg); + if(args->min_nc < 0) args->min_nc = 0; + break; + case 'Q': + args->bq_thresh = atoi(optarg); + break; + case 'z': + args->compress = true; + break; + case 'h': + case '?': + default: error(usage()); break; + } + } + char *fname = NULL; + if(optind == argc) error(usage()); + else fname = argv[optind]; + args->sr = bcf_sr_init(); + args->compress_threads = args->threads + 1; + bcf_sr_set_threads(args->sr, args->threads); + // Process region arguments if present + if(regions_list) { + if(bcf_sr_set_regions(args->sr, regions_list, regions_file) < 0) error("Failed to parse the regions: %s\n", regions_list); + } else if(optind + 1 < argc) { + kstring_t tmp = {0, 0, 0}; + kputs(argv[optind + 1], &tmp); + for(int k = optind + 2; k < argc; k++) { + kputc(',', &tmp); + kputs(argv[k], &tmp); + } + if(bcf_sr_set_regions(args->sr, tmp.s, 0) < 0) error("Failed to parse the regions: %s\n", tmp.s); + free(tmp.s); + } + + if(!bcf_sr_add_reader(args->sr, fname)) + error("failed to read from %s: %s\n", fname, bcf_sr_strerror(args->sr->errnum)); + + args->hdr = args->sr->readers[0].header; + if(args->sr->regions) { + bcf_sr_regions_t *reg = args->sr->regions; + // Sort regions by chromosome (required for bigWig, bigBed generation) + int *ix = malloc(sizeof(int) * reg->nseqs); + int nr = 0; + for(int i = 0; i < 
reg->nseqs; i++) { + if(bcf_hdr_name2id(args->hdr, reg->seq_names[i]) < 0) { + fprintf(stderr,"Warning: requested contig %s not present in input file\n", reg->seq_names[i]); + continue; + } + ix[nr++] = i; + } + if(!nr) error("None of the requested contigs are present in the input file\n"); + qsort_r(ix, nr, sizeof(int), cmp_reg, reg); + char **tseq = malloc(sizeof(char *) * nr); + struct _region_t *treg = malloc(sizeof(struct _region_t) * nr); + for(int i = 0; i < nr; i++) { + const int j = ix[i]; + tseq[i] = reg->seq_names[j]; + memcpy(treg + i, reg->regs + j, sizeof(struct _region_t)); + khash_str2int_set(reg->seq_hash, tseq[i], i); + } + free(reg->seq_names); + free(reg->regs); + reg->seq_names = tseq; + reg->regs = treg; + reg->nseqs = nr; + free(ix); + } + + // If no regions have been specified. + // extract all contigs from VCF/BCF header and add as regions after sorting + if(!args->sr->regions) { + int nctgs = args->hdr->n[BCF_DT_CTG]; + if(nctgs > 0) { + // We can't add regions with an open reader so we remove (close) it, add the regions + // and then re-open. 
Removing the reader will result in the header info being + // destroyed so we have to make a copy of the contig information beforehand + struct tctg * const ctgs = malloc(sizeof(struct tctg) * nctgs); + for(int i = 0; i < nctgs; i++) { + const bcf_idpair_t * const idp = args->hdr->id[BCF_DT_CTG] + i; + ctgs[i].name = strdup(idp->key); + ctgs[i].len = idp->val->info[0]; + } + qsort(ctgs, nctgs, sizeof(struct tctg), cmp_tctg); + bcf_sr_destroy(args->sr); + args->sr = bcf_sr_init(); + bcf_sr_set_threads(args->sr, args->threads); + bcf_sr_regions_t * const reg = calloc(1, sizeof(bcf_sr_regions_t)); + reg->start = reg->end = -1; + reg->prev_start = reg->prev_end = reg->prev_seq = -1; + reg->seq_hash = khash_str2int_init(); + reg->seq_names = calloc(nctgs, sizeof(char *)); + reg->regs = calloc(nctgs, sizeof(struct _region_t)); + for(int i = 0; i < nctgs; i++) { + reg->nseqs++; + reg->seq_names[i] = ctgs[i].name; + khash_str2int_set(reg->seq_hash, reg->seq_names[i], i); + reg->regs[i].creg = -1; + reg->regs[i].nregs = reg->regs[i].mregs = 1; + reg->regs[i].regs = malloc(sizeof(region1_t)); + reg->regs[i].regs->start = 0; + reg->regs[i].regs->end = ctgs[i].len - 1; + } + args->sr->regions = reg; + args->sr->explicit_regs = 1; + args->sr->require_index = 1; + free(ctgs); + if(!bcf_sr_add_reader(args->sr, fname)) + error("failed to read from %s: %s\n", fname, bcf_sr_strerror(args->sr->errnum)); + args->hdr = args->sr->readers[0].header; + } + } + bcf_sr_regions_t * const reg = args->sr->regions; + int nctgs = reg->nseqs; + args->cumul_len = malloc(sizeof(uint64_t) * nctgs); + args->cumul_len[0] = reg->regs[0].regs->end + 1; + for(int i = 1; i < nctgs; i++) args->cumul_len[i] = args->cumul_len[i - 1] + reg->regs[i].regs->end + 1; + int nctgs_vcf = args->hdr->n[BCF_DT_CTG]; + args->id_trans = malloc(sizeof(int) * nctgs_vcf); + for(int i = 0; i < nctgs_vcf; i++) { + const bcf_idpair_t * const idp = args->hdr->id[BCF_DT_CTG] + i; + int ret = 
khash_str2int_get(args->sr->regions->seq_hash, idp->key, args->id_trans + i); + if(ret < 0) args->id_trans[i] = -1; + } + int ns = bcf_hdr_nsamples(args->hdr); + assert(ns > 0); + for(int i = 0; i < REC_BUF_SIZE; i++) args->rec_buf.buf[i] = rec_init(ns); + if((args->bedmethyl) && ns > 1) error("bedMethyl output not compatible with multi-sample files\n"); + check_hdr_params(args); +} diff --git a/tools/utils/configure b/tools/utils/configure new file mode 100755 index 00000000..12aced1d --- /dev/null +++ b/tools/utils/configure @@ -0,0 +1,2847 @@ +#! /bin/sh +# Guess values for system-dependent variables and create Makefiles. +# Generated by GNU Autoconf 2.69 for gemBS_utils 1.0. +# +# +# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. +# +# +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. +## -------------------- ## +## M4sh Initialization. ## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. 
+if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! 
-f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +# Use a proper internal environment variable to ensure we don't fall + # into an infinite loop, continuously re-executing ourselves. + if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then + _as_can_reexec=no; export _as_can_reexec; + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. +BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +as_fn_exit 255 + fi + # We don't want this to propagate to other subprocesses. + { _as_can_reexec=; unset _as_can_reexec;} +if test "x$CONFIG_SHELL" = x; then + as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which + # is contrary to our usage. 
Disable this feature. + alias -g '\${1+\"\$@\"}'='\"\$@\"' + setopt NO_GLOB_SUBST +else + case \`(set -o) 2>/dev/null\` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi +" + as_required="as_fn_return () { (exit \$1); } +as_fn_success () { as_fn_return 0; } +as_fn_failure () { as_fn_return 1; } +as_fn_ret_success () { return 0; } +as_fn_ret_failure () { return 1; } + +exitcode=0 +as_fn_success || { exitcode=1; echo as_fn_success failed.; } +as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } +as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } +as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } +if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : + +else + exitcode=1; echo positional parameters were not saved. +fi +test x\$exitcode = x0 || exit 1 +test -x / || exit 1" + as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO + as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO + eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && + test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1" + if (eval "$as_required") 2>/dev/null; then : + as_have_required=yes +else + as_have_required=no +fi + if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : + +else + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +as_found=false +for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + as_found=: + case $as_dir in #( + /*) + for as_base in sh bash ksh sh5; do + # Try only shells that exist, to save several forks. 
+ as_shell=$as_dir/$as_base + if { test -f "$as_shell" || test -f "$as_shell.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : + CONFIG_SHELL=$as_shell as_have_required=yes + if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : + break 2 +fi +fi + done;; + esac + as_found=false +done +$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : + CONFIG_SHELL=$SHELL as_have_required=yes +fi; } +IFS=$as_save_IFS + + + if test "x$CONFIG_SHELL" != x; then : + export CONFIG_SHELL + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. +BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +exit 255 +fi + + if test x$as_have_required = xno; then : + $as_echo "$0: This script requires a shell more modern than all" + $as_echo "$0: the shells that I found on your system." + if test x${ZSH_VERSION+set} = xset ; then + $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" + $as_echo "$0: be upgraded to zsh 4.3.4 or later." + else + $as_echo "$0: Please tell bug-autoconf@gnu.org about your system, +$0: including any error possibly output before this +$0: message. Then install a modern shell, or manually run +$0: the script under such a shell if you do have one." 
+ fi + exit 1 +fi +fi +fi +SHELL=${CONFIG_SHELL-/bin/sh} +export SHELL +# Unset more variables known to interfere with behavior of common tools. +CLICOLOR_FORCE= GREP_OPTIONS= +unset CLICOLOR_FORCE GREP_OPTIONS + +## --------------------- ## +## M4sh Shell Functions. ## +## --------------------- ## +# as_fn_unset VAR +# --------------- +# Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. 
+as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. 
+as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + + + as_lineno_1=$LINENO as_lineno_1a=$LINENO + as_lineno_2=$LINENO as_lineno_2a=$LINENO + eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" && + test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || { + # Blame Lee E. McMahon (1931-1989) for sed's syntax. 
:-) + sed -n ' + p + /[$]LINENO/= + ' <$as_myself | + sed ' + s/[$]LINENO.*/&-/ + t lineno + b + :lineno + N + :loop + s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ + t loop + s/-\n.*// + ' >$as_me.lineno && + chmod +x "$as_me.lineno" || + { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } + + # If we had to re-execute with $CONFIG_SHELL, we're ensured to have + # already done that, so ensure we don't try to do so again and fall + # in an infinite loop. This has already happened in practice. + _as_can_reexec=no; export _as_can_reexec + # Don't try to exec as it changes $[0], causing all sort of problems + # (the dirname of $[0] is not the place where we might find the + # original and so on. Autoconf is especially sensitive to this). + . "./$as_me.lineno" + # Exit status is that of the last command. + exit +} + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + +if mkdir -p . 
2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + +as_test_x='test -x' +as_executable_p=as_fn_executable_p + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +test -n "$DJDIR" || exec 7<&0 &1 + +# Name of the host. +# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status, +# so uname gets run too. +ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` + +# +# Initializations. +# +ac_default_prefix=/usr/local +ac_clean_files= +ac_config_libobj_dir=. +LIBOBJS= +cross_compiling=no +subdirs= +MFLAGS= +MAKEFLAGS= + +# Identity of this package. +PACKAGE_NAME='gemBS_utils' +PACKAGE_TARNAME='gembs_utils' +PACKAGE_VERSION='1.0' +PACKAGE_STRING='gemBS_utils 1.0' +PACKAGE_BUGREPORT='' +PACKAGE_URL='' + +ac_subst_vars='LTLIBOBJS +LIBOBJS +HTSINC +HTSLIBS +target_alias +host_alias +build_alias +LIBS +ECHO_T +ECHO_N +ECHO_C +DEFS +mandir +localedir +libdir +psdir +pdfdir +dvidir +htmldir +infodir +docdir +oldincludedir +includedir +localstatedir +sharedstatedir +sysconfdir +datadir +datarootdir +libexecdir +sbindir +bindir +program_transform_name +prefix +exec_prefix +PACKAGE_URL +PACKAGE_BUGREPORT +PACKAGE_STRING +PACKAGE_VERSION +PACKAGE_TARNAME +PACKAGE_NAME +PATH_SEPARATOR +SHELL' +ac_subst_files='' +ac_user_opts=' +enable_option_checking +with_htslib +' + ac_precious_vars='build_alias +host_alias +target_alias' + + +# Initialize some variables set by options. +ac_init_help= +ac_init_version=false +ac_unrecognized_opts= +ac_unrecognized_sep= +# The variables have the same names as the options, with +# dashes changed to underlines. 
+cache_file=/dev/null +exec_prefix=NONE +no_create= +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +verbose= +x_includes=NONE +x_libraries=NONE + +# Installation directory options. +# These are left unexpanded so users can "make install exec_prefix=/foo" +# and all the variables that are supposed to be based on exec_prefix +# by default will actually change. +# Use braces instead of parens because sh, perl, etc. also accept them. +# (The list follows the same order as the GNU Coding Standards.) +bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datarootdir='${prefix}/share' +datadir='${datarootdir}' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +includedir='${prefix}/include' +oldincludedir='/usr/include' +docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' +infodir='${datarootdir}/info' +htmldir='${docdir}' +dvidir='${docdir}' +pdfdir='${docdir}' +psdir='${docdir}' +libdir='${exec_prefix}/lib' +localedir='${datarootdir}/locale' +mandir='${datarootdir}/man' + +ac_prev= +ac_dashdash= +for ac_option +do + # If the previous option needs an argument, assign it. + if test -n "$ac_prev"; then + eval $ac_prev=\$ac_option + ac_prev= + continue + fi + + case $ac_option in + *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; + *=) ac_optarg= ;; + *) ac_optarg=yes ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. 
+ + case $ac_dashdash$ac_option in + --) + ac_dashdash=yes ;; + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir=$ac_optarg ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build_alias ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build_alias=$ac_optarg ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file=$ac_optarg ;; + + --config-cache | -C) + cache_file=config.cache ;; + + -datadir | --datadir | --datadi | --datad) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=*) + datadir=$ac_optarg ;; + + -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ + | --dataroo | --dataro | --datar) + ac_prev=datarootdir ;; + -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \ + | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) + datarootdir=$ac_optarg ;; + + -disable-* | --disable-*) + ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? 
"invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=no ;; + + -docdir | --docdir | --docdi | --doc | --do) + ac_prev=docdir ;; + -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) + docdir=$ac_optarg ;; + + -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) + ac_prev=dvidir ;; + -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) + dvidir=$ac_optarg ;; + + -enable-* | --enable-*) + ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=\$ac_optarg ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix=$ac_optarg ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. 
+ with_gas=yes ;; + + -help | --help | --hel | --he | -h) + ac_init_help=long ;; + -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) + ac_init_help=recursive ;; + -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) + ac_init_help=short ;; + + -host | --host | --hos | --ho) + ac_prev=host_alias ;; + -host=* | --host=* | --hos=* | --ho=*) + host_alias=$ac_optarg ;; + + -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) + ac_prev=htmldir ;; + -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ + | --ht=*) + htmldir=$ac_optarg ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir=$ac_optarg ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir=$ac_optarg ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir=$ac_optarg ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir=$ac_optarg ;; + + -localedir | --localedir | --localedi | --localed | --locale) + ac_prev=localedir ;; + -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) + localedir=$ac_optarg ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | --localst | --locals) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) + localstatedir=$ac_optarg ;; + + 
-mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir=$ac_optarg ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. + with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c | -n) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir=$ac_optarg ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix=$ac_optarg ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix=$ac_optarg ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | --program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix=$ac_optarg ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | 
--program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | --program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name=$ac_optarg ;; + + -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) + ac_prev=pdfdir ;; + -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) + pdfdir=$ac_optarg ;; + + -psdir | --psdir | --psdi | --psd | --ps) + ac_prev=psdir ;; + -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) + psdir=$ac_optarg ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir=$ac_optarg ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir=$ac_optarg ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site=$ac_optarg ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + srcdir=$ac_optarg ;; + + 
-sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) + sysconfdir=$ac_optarg ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target_alias ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target_alias=$ac_optarg ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers | -V) + ac_init_version=: ;; + + -with-* | --with-*) + ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=\$ac_optarg ;; + + -without-* | --without-*) + ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=no ;; + + --x) + # Obsolete; use --with-x. 
+ with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes=$ac_optarg ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries=$ac_optarg ;; + + -*) as_fn_error $? "unrecognized option: \`$ac_option' +Try \`$0 --help' for more information" + ;; + + *=*) + ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` + # Reject names that are not valid shell variable names. + case $ac_envvar in #( + '' | [0-9]* | *[!_$as_cr_alnum]* ) + as_fn_error $? "invalid variable name: \`$ac_envvar'" ;; + esac + eval $ac_envvar=\$ac_optarg + export $ac_envvar ;; + + *) + # FIXME: should be removed in autoconf 3.0. + $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 + expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && + $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 + : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" + ;; + + esac +done + +if test -n "$ac_prev"; then + ac_option=--`echo $ac_prev | sed 's/_/-/g'` + as_fn_error $? "missing argument to $ac_option" +fi + +if test -n "$ac_unrecognized_opts"; then + case $enable_option_checking in + no) ;; + fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; + *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; + esac +fi + +# Check all directory arguments for consistency. 
+for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ + datadir sysconfdir sharedstatedir localstatedir includedir \ + oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ + libdir localedir mandir +do + eval ac_val=\$$ac_var + # Remove trailing slashes. + case $ac_val in + */ ) + ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'` + eval $ac_var=\$ac_val;; + esac + # Be sure to have absolute directory names. + case $ac_val in + [\\/$]* | ?:[\\/]* ) continue;; + NONE | '' ) case $ac_var in *prefix ) continue;; esac;; + esac + as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val" +done + +# There might be people who depend on the old broken behavior: `$host' +# used to hold the argument of --host etc. +# FIXME: To remove some day. +build=$build_alias +host=$host_alias +target=$target_alias + +# FIXME: To remove some day. +if test "x$host_alias" != x; then + if test "x$build_alias" = x; then + cross_compiling=maybe + elif test "x$build_alias" != "x$host_alias"; then + cross_compiling=yes + fi +fi + +ac_tool_prefix= +test -n "$host_alias" && ac_tool_prefix=$host_alias- + +test "$silent" = yes && exec 6>/dev/null + + +ac_pwd=`pwd` && test -n "$ac_pwd" && +ac_ls_di=`ls -di .` && +ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || + as_fn_error $? "working directory cannot be determined" +test "X$ac_ls_di" = "X$ac_pwd_ls_di" || + as_fn_error $? "pwd does not report name of working directory" + + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then the parent directory. + ac_confdir=`$as_dirname -- "$as_myself" || +$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_myself" : 'X\(//\)[^/]' \| \ + X"$as_myself" : 'X\(//\)$' \| \ + X"$as_myself" : 'X\(/\)' \| . 
2>/dev/null || +$as_echo X"$as_myself" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + srcdir=$ac_confdir + if test ! -r "$srcdir/$ac_unique_file"; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r "$srcdir/$ac_unique_file"; then + test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." + as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir" +fi +ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" +ac_abs_confdir=`( + cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg" + pwd)` +# When building in place, set srcdir=. +if test "$ac_abs_confdir" = "$ac_pwd"; then + srcdir=. +fi +# Remove unnecessary trailing slashes from srcdir. +# Double slashes in file names in object file debugging info +# mess up M-x gdb in Emacs. +case $srcdir in +*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; +esac +for ac_var in $ac_precious_vars; do + eval ac_env_${ac_var}_set=\${${ac_var}+set} + eval ac_env_${ac_var}_value=\$${ac_var} + eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} + eval ac_cv_env_${ac_var}_value=\$${ac_var} +done + +# +# Report the --help message. +# +if test "$ac_init_help" = "long"; then + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat <<_ACEOF +\`configure' configures gemBS_utils 1.0 to adapt to many kinds of systems. + +Usage: $0 [OPTION]... [VAR=VALUE]... + +To assign environment variables (e.g., CC, CFLAGS...), specify them as +VAR=VALUE. See below for descriptions of some of the useful variables. + +Defaults for the options are specified in brackets. 
+ +Configuration: + -h, --help display this help and exit + --help=short display options specific to this package + --help=recursive display the short help of all the included packages + -V, --version display version information and exit + -q, --quiet, --silent do not print \`checking ...' messages + --cache-file=FILE cache test results in FILE [disabled] + -C, --config-cache alias for \`--cache-file=config.cache' + -n, --no-create do not create output files + --srcdir=DIR find the sources in DIR [configure dir or \`..'] + +Installation directories: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [PREFIX] + +By default, \`make install' will install all the files in +\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify +an installation prefix other than \`$ac_default_prefix' using \`--prefix', +for instance \`--prefix=\$HOME'. + +For better control, use the options below. 
+ +Fine tuning of the installation directories: + --bindir=DIR user executables [EPREFIX/bin] + --sbindir=DIR system admin executables [EPREFIX/sbin] + --libexecdir=DIR program executables [EPREFIX/libexec] + --sysconfdir=DIR read-only single-machine data [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] + --localstatedir=DIR modifiable single-machine data [PREFIX/var] + --libdir=DIR object code libraries [EPREFIX/lib] + --includedir=DIR C header files [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc [/usr/include] + --datarootdir=DIR read-only arch.-independent data root [PREFIX/share] + --datadir=DIR read-only architecture-independent data [DATAROOTDIR] + --infodir=DIR info documentation [DATAROOTDIR/info] + --localedir=DIR locale-dependent data [DATAROOTDIR/locale] + --mandir=DIR man documentation [DATAROOTDIR/man] + --docdir=DIR documentation root [DATAROOTDIR/doc/gembs_utils] + --htmldir=DIR html documentation [DOCDIR] + --dvidir=DIR dvi documentation [DOCDIR] + --pdfdir=DIR pdf documentation [DOCDIR] + --psdir=DIR ps documentation [DOCDIR] +_ACEOF + + cat <<\_ACEOF +_ACEOF +fi + +if test -n "$ac_init_help"; then + case $ac_init_help in + short | recursive ) echo "Configuration of gemBS_utils 1.0:";; + esac + cat <<\_ACEOF + +Optional Packages: + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --with-htslib=PATH specify prefix directory for installed htslib + library. + +Report bugs to the package provider. +_ACEOF +ac_status=$? +fi + +if test "$ac_init_help" = "recursive"; then + # If there are subdirs, report their specific --help. + for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue + test -d "$ac_dir" || + { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } || + continue + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. 
ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + cd "$ac_dir" || { ac_status=$?; continue; } + # Check for guested configure. + if test -f "$ac_srcdir/configure.gnu"; then + echo && + $SHELL "$ac_srcdir/configure.gnu" --help=recursive + elif test -f "$ac_srcdir/configure"; then + echo && + $SHELL "$ac_srcdir/configure" --help=recursive + else + $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 + fi || ac_status=$? + cd "$ac_pwd" || { ac_status=$?; break; } + done +fi + +test -n "$ac_init_help" && exit $ac_status +if $ac_init_version; then + cat <<\_ACEOF +gemBS_utils configure 1.0 +generated by GNU Autoconf 2.69 + +Copyright (C) 2012 Free Software Foundation, Inc. +This configure script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it. +_ACEOF + exit +fi + +## ------------------------ ## +## Autoconf initialization. 
## +## ------------------------ ## +cat >config.log <<_ACEOF +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. + +It was created by gemBS_utils $as_me 1.0, which was +generated by GNU Autoconf 2.69. Invocation command line was + + $ $0 $@ + +_ACEOF +exec 5>>config.log +{ +cat <<_ASUNAME +## --------- ## +## Platform. ## +## --------- ## + +hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` + +/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` +/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` +/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` +/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` + +_ASUNAME + +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + $as_echo "PATH: $as_dir" + done +IFS=$as_save_IFS + +} >&5 + +cat >&5 <<_ACEOF + + +## ----------- ## +## Core tests. ## +## ----------- ## + +_ACEOF + + +# Keep a trace of the command line. +# Strip out --no-create and --no-recursion so they do not pile up. +# Strip out --silent because we don't want to record it for future runs. +# Also quote any args containing shell meta-characters. +# Make two passes to allow for proper duplicate-argument suppression. 
+ac_configure_args= +ac_configure_args0= +ac_configure_args1= +ac_must_keep_next=false +for ac_pass in 1 2 +do + for ac_arg + do + case $ac_arg in + -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + continue ;; + *\'*) + ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + case $ac_pass in + 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; + 2) + as_fn_append ac_configure_args1 " '$ac_arg'" + if test $ac_must_keep_next = true; then + ac_must_keep_next=false # Got value, back to normal. + else + case $ac_arg in + *=* | --config-cache | -C | -disable-* | --disable-* \ + | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ + | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ + | -with-* | --with-* | -without-* | --without-* | --x) + case "$ac_configure_args0 " in + "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; + esac + ;; + -* ) ac_must_keep_next=true ;; + esac + fi + as_fn_append ac_configure_args " '$ac_arg'" + ;; + esac + done +done +{ ac_configure_args0=; unset ac_configure_args0;} +{ ac_configure_args1=; unset ac_configure_args1;} + +# When interrupted or exit'd, cleanup temporary files, and complete +# config.log. We remove comments because anyway the quotes in there +# would cause problems or look ugly. +# WARNING: Use '\'' to represent an apostrophe within the trap. +# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. +trap 'exit_status=$? + # Save into config.log some information that might help in debugging. + { + echo + + $as_echo "## ---------------- ## +## Cache variables. 
## +## ---------------- ##" + echo + # The following way of writing the cache mishandles newlines in values, +( + for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + (set) 2>&1 | + case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + sed -n \ + "s/'\''/'\''\\\\'\'''\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" + ;; #( + *) + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) + echo + + $as_echo "## ----------------- ## +## Output variables. ## +## ----------------- ##" + echo + for ac_var in $ac_subst_vars + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + + if test -n "$ac_subst_files"; then + $as_echo "## ------------------- ## +## File substitutions. ## +## ------------------- ##" + echo + for ac_var in $ac_subst_files + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + fi + + if test -s confdefs.h; then + $as_echo "## ----------- ## +## confdefs.h. 
## +## ----------- ##" + echo + cat confdefs.h + echo + fi + test "$ac_signal" != 0 && + $as_echo "$as_me: caught signal $ac_signal" + $as_echo "$as_me: exit $exit_status" + } >&5 + rm -f core *.core core.conftest.* && + rm -f -r conftest* confdefs* conf$$* $ac_clean_files && + exit $exit_status +' 0 +for ac_signal in 1 2 13 15; do + trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal +done +ac_signal=0 + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -f -r conftest* confdefs.h + +$as_echo "/* confdefs.h */" > confdefs.h + +# Predefined preprocessor variables. + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_NAME "$PACKAGE_NAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_TARNAME "$PACKAGE_TARNAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_VERSION "$PACKAGE_VERSION" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_STRING "$PACKAGE_STRING" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_URL "$PACKAGE_URL" +_ACEOF + + +# Let the site file select an alternate cache file if it wants to. +# Prefer an explicitly selected file to automatically selected ones. +ac_site_file1=NONE +ac_site_file2=NONE +if test -n "$CONFIG_SITE"; then + # We do not want a PATH search for config.site. 
+ case $CONFIG_SITE in #(( + -*) ac_site_file1=./$CONFIG_SITE;; + */*) ac_site_file1=$CONFIG_SITE;; + *) ac_site_file1=./$CONFIG_SITE;; + esac +elif test "x$prefix" != xNONE; then + ac_site_file1=$prefix/share/config.site + ac_site_file2=$prefix/etc/config.site +else + ac_site_file1=$ac_default_prefix/share/config.site + ac_site_file2=$ac_default_prefix/etc/config.site +fi +for ac_site_file in "$ac_site_file1" "$ac_site_file2" +do + test "x$ac_site_file" = xNONE && continue + if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 +$as_echo "$as_me: loading site script $ac_site_file" >&6;} + sed 's/^/| /' "$ac_site_file" >&5 + . "$ac_site_file" \ + || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "failed to load site script $ac_site_file +See \`config.log' for more details" "$LINENO" 5; } + fi +done + +if test -r "$cache_file"; then + # Some versions of bash will fail to source /dev/null (special files + # actually), so we avoid doing that. DJGPP emulates it as a regular file. + if test /dev/null != "$cache_file" && test -f "$cache_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 +$as_echo "$as_me: loading cache $cache_file" >&6;} + case $cache_file in + [\\/]* | ?:[\\/]* ) . "$cache_file";; + *) . "./$cache_file";; + esac + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 +$as_echo "$as_me: creating cache $cache_file" >&6;} + >$cache_file +fi + +# Check that the precious variables saved in the cache have kept the same +# value. 
+ac_cache_corrupted=false +for ac_var in $ac_precious_vars; do + eval ac_old_set=\$ac_cv_env_${ac_var}_set + eval ac_new_set=\$ac_env_${ac_var}_set + eval ac_old_val=\$ac_cv_env_${ac_var}_value + eval ac_new_val=\$ac_env_${ac_var}_value + case $ac_old_set,$ac_new_set in + set,) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,set) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,);; + *) + if test "x$ac_old_val" != "x$ac_new_val"; then + # differences in whitespace do not lead to failure. + ac_old_val_w=`echo x $ac_old_val` + ac_new_val_w=`echo x $ac_new_val` + if test "$ac_old_val_w" != "$ac_new_val_w"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 +$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} + ac_cache_corrupted=: + else + { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 +$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} + eval $ac_var=\$ac_old_val + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 +$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 +$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} + fi;; + esac + # Pass precious variables to config.status. + if test "$ac_new_set" = set; then + case $ac_new_val in + *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; + *) ac_arg=$ac_var=$ac_new_val ;; + esac + case " $ac_configure_args " in + *" '$ac_arg' "*) ;; # Avoid dups. 
Use of quotes ensures accuracy. + *) as_fn_append ac_configure_args " '$ac_arg'" ;; + esac + fi +done +if $ac_cache_corrupted; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 +$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} + as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 +fi +## -------------------- ## +## Main body of script. ## +## -------------------- ## + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + + +# Check whether --with-htslib was given. +if test "${with_htslib+set}" = set; then : + withval=$with_htslib; +fi + +if test "x$with_htslib" != "x"; then + HTSLIBS="$with_htslib/libhts.a -lcurl -lcrypto -llzma" + HTSINC="-I$with_htslib" +else + HTSLIBS="-lhts -lcurl -lcrypto -llzma" + HTSINC="" +fi + + + + +ac_config_files="$ac_config_files Makefile" + +cat >confcache <<\_ACEOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. +# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. + +_ACEOF + +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. 
+# So, we kill variables containing newlines. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. +( + for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + + (set) 2>&1 | + case $as_nl`(ac_space=' '; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + # `set' does not quote correctly, so add quotes: double-quote + # substitution turns \\\\ into \\, and sed turns \\ into \. + sed -n \ + "s/'/'\\\\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" + ;; #( + *) + # `set' quotes correctly as required by POSIX, so do not add quotes. + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) | + sed ' + /^ac_cv_env_/b end + t clear + :clear + s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ + t end + s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ + :end' >>confcache +if diff "$cache_file" confcache >/dev/null 2>&1; then :; else + if test -w "$cache_file"; then + if test "x$cache_file" != "x/dev/null"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 +$as_echo "$as_me: updating cache $cache_file" >&6;} + if test ! 
-f "$cache_file" || test -h "$cache_file"; then + cat confcache >"$cache_file" + else + case $cache_file in #( + */* | ?:*) + mv -f confcache "$cache_file"$$ && + mv -f "$cache_file"$$ "$cache_file" ;; #( + *) + mv -f confcache "$cache_file" ;; + esac + fi + fi + else + { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 +$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} + fi +fi +rm -f confcache + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +# Transform confdefs.h into DEFS. +# Protect against shell expansion while executing Makefile rules. +# Protect against Makefile macro expansion. +# +# If the first sed substitution is executed (which looks for macros that +# take arguments), then branch to the quote section. Otherwise, +# look for a macro that doesn't take arguments. +ac_script=' +:mline +/\\$/{ + N + s,\\\n,, + b mline +} +t clear +:clear +s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g +t quote +s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g +t quote +b any +:quote +s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g +s/\[/\\&/g +s/\]/\\&/g +s/\$/$$/g +H +:any +${ + g + s/^\n// + s/\n/ /g + p +} +' +DEFS=`sed -n "$ac_script" confdefs.h` + + +ac_libobjs= +ac_ltlibobjs= +U= +for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue + # 1. Remove the extension, and $U if already installed. + ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' + ac_i=`$as_echo "$ac_i" | sed "$ac_script"` + # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR + # will be set to the directory where LIBOBJS objects are built. 
+ as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" + as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo' +done +LIBOBJS=$ac_libobjs + +LTLIBOBJS=$ac_ltlibobjs + + + +: "${CONFIG_STATUS=./config.status}" +ac_write_fail=0 +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files $CONFIG_STATUS" +{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 +$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} +as_write_fail=0 +cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 +#! $SHELL +# Generated by $as_me. +# Run this file to recreate the current configuration. +# Compiler output produced by configure, useful for debugging +# configure, is in config.log if it exists. + +debug=false +ac_cs_recheck=false +ac_cs_silent=false + +SHELL=\${CONFIG_SHELL-$SHELL} +export SHELL +_ASEOF +cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 +## -------------------- ## +## M4sh Initialization. ## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. 
+if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! 
-f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. +as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_unset VAR +# --------------- +# Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. 
Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. 
+ xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p +if mkdir -p . 
2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p +as_test_x='test -x' +as_executable_p=as_fn_executable_p + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +exec 6>&1 +## ----------------------------------- ## +## Main body of $CONFIG_STATUS script. ## +## ----------------------------------- ## +_ASEOF +test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1 + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# Save the log message, to keep $0 and so on meaningful, and to +# report actual input values of CONFIG_FILES etc. instead of their +# values after options handling. +ac_log=" +This file was extended by gemBS_utils $as_me 1.0, which was +generated by GNU Autoconf 2.69. Invocation command line was + + CONFIG_FILES = $CONFIG_FILES + CONFIG_HEADERS = $CONFIG_HEADERS + CONFIG_LINKS = $CONFIG_LINKS + CONFIG_COMMANDS = $CONFIG_COMMANDS + $ $0 $@ + +on `(hostname || uname -n) 2>/dev/null | sed 1q` +" + +_ACEOF + +case $ac_config_files in *" +"*) set x $ac_config_files; shift; ac_config_files=$*;; +esac + + + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +# Files that config.status was made for. +config_files="$ac_config_files" + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +ac_cs_usage="\ +\`$as_me' instantiates files and other configuration actions +from templates according to the current configuration. Unless the files +and actions are specified as TAGs, all are instantiated by default. + +Usage: $0 [OPTION]... [TAG]... 
+ + -h, --help print this help, then exit + -V, --version print version number and configuration settings, then exit + --config print configuration, then exit + -q, --quiet, --silent + do not print progress messages + -d, --debug don't remove temporary files + --recheck update $as_me by reconfiguring in the same conditions + --file=FILE[:TEMPLATE] + instantiate the configuration file FILE + +Configuration files: +$config_files + +Report bugs to the package provider." + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" +ac_cs_version="\\ +gemBS_utils config.status 1.0 +configured by $0, generated by GNU Autoconf 2.69, + with options \\"\$ac_cs_config\\" + +Copyright (C) 2012 Free Software Foundation, Inc. +This config.status script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it." + +ac_pwd='$ac_pwd' +srcdir='$srcdir' +test -n "\$AWK" || AWK=awk +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# The default lists apply if the user does not specify any file. +ac_need_defaults=: +while test $# != 0 +do + case $1 in + --*=?*) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` + ac_shift=: + ;; + --*=) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg= + ac_shift=: + ;; + *) + ac_option=$1 + ac_optarg=$2 + ac_shift=shift + ;; + esac + + case $ac_option in + # Handling of the options. 
+ -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + ac_cs_recheck=: ;; + --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) + $as_echo "$ac_cs_version"; exit ;; + --config | --confi | --conf | --con | --co | --c ) + $as_echo "$ac_cs_config"; exit ;; + --debug | --debu | --deb | --de | --d | -d ) + debug=: ;; + --file | --fil | --fi | --f ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + '') as_fn_error $? "missing file argument" ;; + esac + as_fn_append CONFIG_FILES " '$ac_optarg'" + ac_need_defaults=false;; + --he | --h | --help | --hel | -h ) + $as_echo "$ac_cs_usage"; exit ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil | --si | --s) + ac_cs_silent=: ;; + + # This is an error. + -*) as_fn_error $? "unrecognized option: \`$1' +Try \`$0 --help' for more information." ;; + + *) as_fn_append ac_config_targets " $1" + ac_need_defaults=false ;; + + esac + shift +done + +ac_configure_extra_args= + +if $ac_cs_silent; then + exec 6>/dev/null + ac_configure_extra_args="$ac_configure_extra_args --silent" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +if \$ac_cs_recheck; then + set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion + shift + \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 + CONFIG_SHELL='$SHELL' + export CONFIG_SHELL + exec "\$@" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +exec 5>>config.log +{ + echo + sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX +## Running $as_me. ## +_ASBOX + $as_echo "$ac_log" +} >&5 + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 + +# Handling of arguments. +for ac_config_target in $ac_config_targets +do + case $ac_config_target in + "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; + + *) as_fn_error $? 
"invalid argument: \`$ac_config_target'" "$LINENO" 5;; + esac +done + + +# If the user did not use the arguments to specify the items to instantiate, +# then the envvar interface is used. Set only those that are not. +# We use the long form for the default assignment because of an extremely +# bizarre bug on SunOS 4.1.3. +if $ac_need_defaults; then + test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files +fi + +# Have a temporary directory for convenience. Make it in the build tree +# simply because there is no reason against having it here, and in addition, +# creating and moving files from /tmp can sometimes cause problems. +# Hook for its removal unless debugging. +# Note that there is a small window in which the directory will not be cleaned: +# after its creation but before its name has been assigned to `$tmp'. +$debug || +{ + tmp= ac_tmp= + trap 'exit_status=$? + : "${ac_tmp:=$tmp}" + { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status +' 0 + trap 'as_fn_exit 1' 1 2 13 15 +} +# Create a (secure) tmp directory for tmp files. + +{ + tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && + test -d "$tmp" +} || +{ + tmp=./conf$$-$RANDOM + (umask 077 && mkdir "$tmp") +} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5 +ac_tmp=$tmp + +# Set up the scripts for CONFIG_FILES section. +# No need to generate them if there are no CONFIG_FILES. +# This happens for instance with `./config.status config.h'. +if test -n "$CONFIG_FILES"; then + + +ac_cr=`echo X | tr X '\015'` +# On cygwin, bash can eat \r inside `` if the user requested igncr. +# But we know of no other shell where ac_cr would be empty at this +# point, so we can use a bashism as a fallback. 
+if test "x$ac_cr" = x; then + eval ac_cr=\$\'\\r\' +fi +ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null` +if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then + ac_cs_awk_cr='\\r' +else + ac_cs_awk_cr=$ac_cr +fi + +echo 'BEGIN {' >"$ac_tmp/subs1.awk" && +_ACEOF + + +{ + echo "cat >conf$$subs.awk <<_ACEOF" && + echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && + echo "_ACEOF" +} >conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 +ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'` +ac_delim='%!_!# ' +for ac_last_try in false false false false false :; do + . ./conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + + ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` + if test $ac_delim_n = $ac_delim_num; then + break + elif $ac_last_try; then + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + else + ac_delim="$ac_delim!$ac_delim _$ac_delim!! " + fi +done +rm -f conf$$subs.sh + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK && +_ACEOF +sed -n ' +h +s/^/S["/; s/!.*/"]=/ +p +g +s/^[^!]*!// +:repl +t repl +s/'"$ac_delim"'$// +t delim +:nl +h +s/\(.\{148\}\)..*/\1/ +t more1 +s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ +p +n +b repl +:more1 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t nl +:delim +h +s/\(.\{148\}\)..*/\1/ +t more2 +s/["\\]/\\&/g; s/^/"/; s/$/"/ +p +b +:more2 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t delim +' >$CONFIG_STATUS || ac_write_fail=1 +rm -f conf$$subs.awk +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACAWK +cat >>"\$ac_tmp/subs1.awk" <<_ACAWK && + for (key in S) S_is_set[key] = 1 + FS = "" + +} +{ + line = $ 0 + nfields = split(line, field, "@") + substed = 0 + len = length(field[1]) + for (i = 2; i < nfields; i++) { + key = field[i] + keylen = length(key) + if (S_is_set[key]) { + value = S[key] + line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) + len += length(value) + 
length(field[++i]) + substed = 1 + } else + len += 1 + keylen + } + + print line +} + +_ACAWK +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then + sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" +else + cat +fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \ + || as_fn_error $? "could not setup config files machinery" "$LINENO" 5 +_ACEOF + +# VPATH may cause trouble with some makes, so we remove sole $(srcdir), +# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and +# trailing colons and then remove the whole line if VPATH becomes empty +# (actually we leave an empty line to preserve line numbers). +if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{ +h +s/// +s/^/:/ +s/[ ]*$/:/ +s/:\$(srcdir):/:/g +s/:\${srcdir}:/:/g +s/:@srcdir@:/:/g +s/^:*// +s/:*$// +x +s/\(=[ ]*\).*/\1/ +G +s/\n// +s/^[^=]*=[ ]*$// +}' +fi + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +fi # test -n "$CONFIG_FILES" + + +eval set X " :F $CONFIG_FILES " +shift +for ac_tag +do + case $ac_tag in + :[FHLC]) ac_mode=$ac_tag; continue;; + esac + case $ac_mode$ac_tag in + :[FHL]*:*);; + :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;; + :[FH]-) ac_tag=-:-;; + :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; + esac + ac_save_IFS=$IFS + IFS=: + set x $ac_tag + IFS=$ac_save_IFS + shift + ac_file=$1 + shift + + case $ac_mode in + :L) ac_source=$1;; + :[FH]) + ac_file_inputs= + for ac_f + do + case $ac_f in + -) ac_f="$ac_tmp/stdin";; + *) # Look for the file first in the build tree, then in the source tree + # (if the path is not absolute). The absolute path cannot be DOS-style, + # because $ac_f cannot contain `:'. 
+ test -f "$ac_f" || + case $ac_f in + [\\/$]*) false;; + *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; + esac || + as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; + esac + case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac + as_fn_append ac_file_inputs " '$ac_f'" + done + + # Let's still pretend it is `configure' which instantiates (i.e., don't + # use $as_me), people would be surprised to read: + # /* config.h. Generated by config.status. */ + configure_input='Generated from '` + $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' + `' by configure.' + if test x"$ac_file" != x-; then + configure_input="$ac_file. $configure_input" + { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 +$as_echo "$as_me: creating $ac_file" >&6;} + fi + # Neutralize special characters interpreted by sed in replacement strings. + case $configure_input in #( + *\&* | *\|* | *\\* ) + ac_sed_conf_input=`$as_echo "$configure_input" | + sed 's/[\\\\&|]/\\\\&/g'`;; #( + *) ac_sed_conf_input=$configure_input;; + esac + + case $ac_tag in + *:-:* | *:-) cat >"$ac_tmp/stdin" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; + esac + ;; + esac + + ac_dir=`$as_dirname -- "$ac_file" || +$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$ac_file" : 'X\(//\)[^/]' \| \ + X"$ac_file" : 'X\(//\)$' \| \ + X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$ac_file" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + as_dir="$ac_dir"; as_fn_mkdir_p + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. 
+ ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + + case $ac_mode in + :F) + # + # CONFIG_FILE + # + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# If the template does not know about datarootdir, expand it. +# FIXME: This hack should be removed a few years after 2.60. +ac_datarootdir_hack=; ac_datarootdir_seen= +ac_sed_dataroot=' +/datarootdir/ { + p + q +} +/@datadir@/p +/@docdir@/p +/@infodir@/p +/@localedir@/p +/@mandir@/p' +case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in +*datarootdir*) ac_datarootdir_seen=yes;; +*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 +$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 + ac_datarootdir_hack=' + s&@datadir@&$datadir&g + s&@docdir@&$docdir&g + s&@infodir@&$infodir&g + s&@localedir@&$localedir&g + s&@mandir@&$mandir&g + s&\\\${datarootdir}&$datarootdir&g' ;; +esac +_ACEOF + +# Neutralize VPATH when `$srcdir' = `.'. +# Shell code in configure.ac might set extrasub. 
+# FIXME: do we really want to maintain this feature? +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_sed_extra="$ac_vpsub +$extrasub +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +:t +/@[a-zA-Z_][a-zA-Z_0-9]*@/!b +s|@configure_input@|$ac_sed_conf_input|;t t +s&@top_builddir@&$ac_top_builddir_sub&;t t +s&@top_build_prefix@&$ac_top_build_prefix&;t t +s&@srcdir@&$ac_srcdir&;t t +s&@abs_srcdir@&$ac_abs_srcdir&;t t +s&@top_srcdir@&$ac_top_srcdir&;t t +s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t +s&@builddir@&$ac_builddir&;t t +s&@abs_builddir@&$ac_abs_builddir&;t t +s&@abs_top_builddir@&$ac_abs_top_builddir&;t t +$ac_datarootdir_hack +" +eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \ + >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + +test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && + { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && + { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ + "$ac_tmp/out"`; test -z "$ac_out"; } && + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&5 +$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&2;} + + rm -f "$ac_tmp/stdin" + case $ac_file in + -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";; + *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";; + esac \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + ;; + + + + esac + +done # for ac_tag + + +as_fn_exit 0 +_ACEOF +ac_clean_files=$ac_clean_files_save + +test $ac_write_fail = 0 || + as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5 + + +# configure is writing to config.log, and then calls config.status. +# config.status does its own redirection, appending to config.log. 
+# Unfortunately, on DOS this fails, as config.log is still kept open +# by configure, so config.status won't be able to write to it; its +# output is simply discarded. So we exec the FD to /dev/null, +# effectively closing config.log, so it can be properly (re)opened and +# appended to by config.status. When coming back to configure, we +# need to make the FD available again. +if test "$no_create" != yes; then + ac_cs_success=: + ac_config_status_args= + test "$silent" = yes && + ac_config_status_args="$ac_config_status_args --quiet" + exec 5>/dev/null + $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false + exec 5>>config.log + # Use ||, not &&, to avoid exiting from the if with $? = 1, which + # would make configure fail if this is the last instruction. + $ac_cs_success || as_fn_exit 1 +fi +if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 +$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} +fi + diff --git a/tools/utils/configure.ac b/tools/utils/configure.ac new file mode 100644 index 00000000..69a50a42 --- /dev/null +++ b/tools/utils/configure.ac @@ -0,0 +1,18 @@ +AC_INIT([gemBS_utils],[1.0]) + +AC_ARG_WITH(htslib, +[AS_HELP_STRING([--with-htslib=PATH], + [specify prefix directory for installed htslib library.])]) +if test "x$with_htslib" != "x"; then + HTSLIBS="$with_htslib/libhts.a -lcurl -lcrypto -llzma" + HTSINC="-I$with_htslib" +else + HTSLIBS="-lhts -lcurl -lcrypto -llzma" + HTSINC="" +fi + +AC_SUBST([HTSLIBS]) +AC_SUBST([HTSINC]) + +AC_CONFIG_FILES([Makefile]) +AC_OUTPUT diff --git a/tools/utils/files.c b/tools/utils/files.c new file mode 100644 index 00000000..87b4d163 --- /dev/null +++ b/tools/utils/files.c @@ -0,0 +1,173 @@ +/* + * files.c + * + * Created on: Dec 26, 2019 + * Author: heath + */ + +#include +#include +#include +#include +#include +#include +#include +#include 
+#include +#include + +#include "mextr.h" +#include "bbi.h" +#include "htslib/bgzf.h" +#include "htslib/hfile.h" + +static htsFile *open_ofile(char ** const name, bool compress, args_t * const a) { + + htsFile *fp = NULL; + if(name != NULL) { + char *tname = *name; + char mode[3] = {'w', 0, 0}; + bool stream = !strcmp(tname, "-"); + + // If output is to a file then if compression has been asked for + // we add '.gz' to the filename unless already present. If compression + // had not been asked for but the filename ends in '.gz' then we + // turn on compression + + if(!stream) { + // Check if file name ends in '.gz' + char *p = strrchr(tname, '.'); + bool has_gz = p && !strcmp(p + 1, "gz"); + if(compress) { + if(!has_gz) { + tname = malloc(strlen(*name) + 4); + sprintf(tname, "%s.gz", *name); + } + } else compress = has_gz; + } else { + // Turn off compression if output is to a terminal + if(compress && isatty(fileno(stdout))) compress = false; + } + if(compress) mode[1] = 'z'; + hFILE *hfile = hopen(tname, mode); + if(!hfile) error("Couldn't open output file: %s\n", stream ? "" : tname); + fp = hts_hopen(hfile, tname, mode); + if(a->threads > 0) hts_set_opt(fp, HTS_OPT_THREAD_POOL, a->sr->p); + if(tname != *name) *name = tname; + } + return fp; +} + +void init_files(args_t * const a) { + if(a->cpgfilename) a->cpgfile = open_ofile(&a->cpgfilename, a->compress, a); + if(a->noncpgfilename) a->noncpgfile = open_ofile(&a->noncpgfilename, a->compress, a); + if(a->bedmethyl != NULL) { + if(!strcmp(a->bedmethyl, "-")) error("Bedmethyl files can not be output to stdout"); + bool compress = a->compress; + char *p = strrchr(a->bedmethyl, '.'); + if(p && !strcmp(p + 1, "gz")) { + *p = 0; + compress = true; + } + p = strrchr(a->bedmethyl, '.'); + if(p && !strcmp(p + 1, "bed")) *p = 0; + p = strrchr(a->bedmethyl, '_'); + if(p && !(strcmp(p + 1, "cpg") || strcmp(p + 1, "chg") || strcmp(p + 1, "chh")))*p = 0; + const size_t l = strlen(a->bedmethyl) + 9 + (compress ? 
3 : 0); + const int ct = a->strand_specific ? 8 : 7; + p = malloc(l * ct); + a->bedmethylnames[0] = p; + a->bedmethylnames[1] = p + l; + a->bedmethylnames[2] = p + 2 * l; + a->bigbednames[0] = p + 3 * l; + a->bigbednames[1] = p + 4 * l; + a->bigbednames[2] = p + 5 * l; + a->bigwignames[0] = p + 6 * l; + if(a->strand_specific) a->bigwignames[1] = p + 7 * l; + + sprintf(a->bedmethylnames[BEDMETHYL_CPG - 1], "%s_cpg.%s", a->bedmethyl, compress ? "bed.gz" : "bed"); + sprintf(a->bedmethylnames[BEDMETHYL_CHG - 1], "%s_chg.%s", a->bedmethyl, compress ? "bed.gz" : "bed"); + sprintf(a->bedmethylnames[BEDMETHYL_CHH - 1], "%s_chh.%s", a->bedmethyl, compress ? "bed.gz" : "bed"); + sprintf(a->bigbednames[BEDMETHYL_CPG - 1], "%s_cpg.bb", a->bedmethyl); + sprintf(a->bigbednames[BEDMETHYL_CHG - 1], "%s_chg.bb", a->bedmethyl); + sprintf(a->bigbednames[BEDMETHYL_CHH - 1], "%s_chh.bb", a->bedmethyl); + if(a->strand_specific) { + sprintf(a->bigwignames[0], "%s_pos.bw", a->bedmethyl); + sprintf(a->bigwignames[1], "%s_neg.bw", a->bedmethyl); + } else sprintf(a->bigwignames[0], "%s.bw", a->bedmethyl); + for(int i = 0; i < 3; i++) { + a->bedmethylfiles[i] = open_ofile(&a->bedmethylnames[i], compress, a); + a->bigbedfiles[i]= fopen(a->bigbednames[i], "wb"); + } + for(int i = 0; i < 2; i++) { + if(a->bigwignames[i]) a->bigwigfiles[i] = fopen(a->bigwignames[i], "w"); + } + init_bbi_header(a, true); // bigBed headers + init_bbi_header(a, false); // bigWig headers + init_cblocks(a, 4 * a->compress_threads); + } +} + +void close_files(args_t *a) { + if(a->cpgfile != NULL) hts_close(a->cpgfile); + if(a->noncpgfile != NULL) hts_close(a->noncpgfile); + for(int i = 0; i < 3; i++) { + if(a->bedmethylfiles[i] != NULL) hts_close(a->bedmethylfiles[i]); + if(a->bigbedfiles[i] != NULL) fclose(a->bigbedfiles[i]); + } + for(int i = 0; i < 2; i++) { + if(a->bigwigfiles[i] != NULL) fclose(a->bigwigfiles[i]); + } + while(waitpid(-1, NULL, 0) > 0); +} + +#define MD5_BUF_SIZE 4096 + +void calc_stream_md5(FILE * 
const fp, char * const md5) { + MD5_CTX ctx; + MD5_Init(&ctx); + uint8_t buf[MD5_BUF_SIZE]; + while(!feof(fp)) { + size_t len = fread(buf, 1, MD5_BUF_SIZE, fp); + if(len > 0) MD5_Update(&ctx, buf, len); + } + unsigned char b[16]; + const char *hex_digits="0123456789abcdef"; + MD5_Final(b, &ctx); + int k = 0; + for(int i = 0; i < 16; i++) { + md5[k++] = hex_digits[b[i] >> 4]; + md5[k++] = hex_digits[b[i] & 0xf]; + } + md5[k] = 0; +} + +void calc_file_md5(char * const name) { + char *tname = malloc(strlen(name) + 5); + sprintf(tname, "%s.md5", name); + FILE *in = fopen(name, "rb"); + FILE *out = NULL; + int err = 0; + if(in == NULL) { + fprintf(stderr, "calc_file_md5(): Could not open file %s for input: %s\n", name, strerror(errno)); + err = 1; + } else { + out = fopen(tname, "wb"); + if(out == NULL) { + fprintf(stderr, "calc_file_md5(): Could not open file %s for output: %s\n", tname, strerror(errno)); + err = 2; + } + } + char md5[33]; + if(!err) { +#ifndef __MACH__ + posix_fadvise(fileno(in), 0, 0, POSIX_FADV_SEQUENTIAL); +#endif + calc_stream_md5(in, md5); + fprintf(out,"%s %s\n", md5, name); + } + if(out) fclose(out); + if(in) fclose(in); + free(tname); +} + diff --git a/tools/utils/init_params.c b/tools/utils/init_params.c new file mode 100644 index 00000000..a367f390 --- /dev/null +++ b/tools/utils/init_params.c @@ -0,0 +1,56 @@ +/* + * init_params.c + * + * Created on: Dec 26, 2019 + * Author: heath + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "mextr.h" +#include "bbi.h" + +#include "htslib/hfile.h" + +void init_params(args_t *const args) { + memset(args, 0, sizeof(args_t)); + args->bedmethyl_desc = "."; + args->stats = NULL; + args->min_prop = 0.0; + args->min_num = 1; + args->min_inform = 0; + args->min_nc = 1; + args->threads = 1; + args->ref_bias = DEFAULT_REF_BIAS; + args->under_conv = DEFAULT_UNDER_CONV; + args->over_conv = DEFAULT_OVER_CONV; + args->bq_thresh = 
DEFAULT_BQ_THRESH; + args->mq_thresh = DEFAULT_MAPQ_THRESH; + args->mode = CPGMODE_COMBINED; + args->sel_mode = SELECT_HOM; + args->sel_thresh = DEFAULT_SELECT_THRESH; + args->header = true; + pthread_mutex_init(&args->read_buf.mut, NULL); + pthread_mutex_init(&args->rec_buf.mut, NULL); + pthread_mutex_init(&args->cblock_buf.mut, NULL); + for(int i = 0; i < 3; i++) { + pthread_cond_init(args->cblock_buf.cond + i, NULL); + } + for(int i = 0; i < 2; i++) pthread_cond_init(args->read_buf.cond + i, NULL); + for(int i = 0; i < 5; i++) { + args->bb_global[i].buffer = malloc(sizeof(kstring_t)); + ks_initialize(args->bb_global[i].buffer); + args->bb_global[i].first_time = true; + args->bb_global[i].zoom_scales[0] = i < 3 ? INITIAL_REDUCTION : BW_INITIAL_REDUCTION; + for(int j = 1; j < ZOOM_LEVELS; j++) args->bb_global[i].zoom_scales[j] = args->bb_global[i].zoom_scales[j - 1] * ZOOM_RES_INCREMENT; + } + for(int i = 0; i < 3; i++) ks_initialize(args->pr_str + i); +} diff --git a/tools/utils/mextr.c b/tools/utils/mextr.c new file mode 100644 index 00000000..3173a99d --- /dev/null +++ b/tools/utils/mextr.c @@ -0,0 +1,100 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "mextr.h" +#include "bbi.h" + +#include "htslib/hfile.h" +#include "htslib/bgzf.h" + +void error(const char *format, ...) 
+{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + exit(-1); +} + +void destroy(args_t * const args) +{ + write_stats(args); +} + +int main(int argc, char **argv) { + + args_t args; + init_params(&args); + handle_command_line(argc, argv, &args); + int ns = bcf_hdr_nsamples(args.hdr); + assert(ns > 0); + init_files(&args); + pthread_t cpg_thr, noncpg_thr, wig_thr, bedmethyl_thr; + thr_info_t noncpg_info = {&args, output_noncpg, RJ_OUTPUT_NONCPG}; + thr_info_t bedmethyl_info = {&args, output_bedmethyl, RJ_OUTPUT_BEDMETHYL}; + if(args.cpgfile) { + args.job_mask |= RJ_OUTPUT_CPG; + pthread_create(&cpg_thr, NULL, cpg_thread, &args); + } + if(args.noncpgfile) { + args.job_mask |= RJ_OUTPUT_NONCPG; + pthread_create(&noncpg_thr, NULL, output_thread, &noncpg_info); + } + if(args.bedmethyl) { + args.job_mask |= RJ_OUTPUT_BEDMETHYL; + pthread_create(&bedmethyl_thr, NULL, handle_bedmethyl_thread, &bedmethyl_info); + } + + if(args.header || args.bedmethyl) print_headers(&args); + if(args.reportfilename != NULL) init_stats(&args); + + args.sample_Q[0] = malloc(sizeof(double) * ((ns + 1) * 2 + ns)); + args.sample_Q[1] = args.sample_Q[0] + ns + 1; + args.sample_Q[2] = args.sample_Q[1] + ns + 1; + args.sample_Q1[0] = malloc(sizeof(double) * ((ns + 1) * 2 + ns)); + args.sample_Q1[1] = args.sample_Q1[0] + ns + 1; + args.sample_Q1[2] = args.sample_Q1[1] + ns + 1; + args.sample_cpg = (args.mode == CPGMODE_COMBINED) ? 
malloc(sizeof(cpg_prob) * ns) : NULL; + fill_base_prob_table(); + pthread_t read_thr; + pthread_create(&read_thr, NULL, read_thread, &args); + int nt = 1; + // If the user has asked for more threads we will take one extra thread for the bcf unpacking - more than this is rarely useful + if(args.threads > 1) { + nt++; + args.threads--; + } + pthread_t * const unpack_bcf_thr = malloc(sizeof(pthread_t) * nt); + gthr_info_t * const gthr_info = malloc(sizeof(gthr_info_t) * nt); + for(int i = 0; i < nt; i++) { + gthr_info[i].args = &args; + gthr_info[i].thread_idx = i; + pthread_create(unpack_bcf_thr + i, NULL, unpack_bcf_thread, gthr_info + i); + } + pthread_t handle_rec_buf_thr; + pthread_create(&handle_rec_buf_thr, NULL, handle_rec_buf, &args); + pthread_join(read_thr, NULL); + for(int i = 0; i < nt; i++) pthread_join(unpack_bcf_thr[i], NULL); + args.proc_finished = true; + pthread_join(handle_rec_buf_thr, NULL); + args.rec_finished = true; + if(args.cpgfilename) pthread_join(cpg_thr, NULL); + if(args.noncpgfilename) pthread_join(noncpg_thr, NULL); + if(args.bedmethyl) pthread_join(bedmethyl_thr, NULL); + fprintf(stderr,"mextr: finished\n"); + destroy(&args); + bcf_sr_destroy(args.sr); + return 0; +} + + diff --git a/tools/utils/mextr.h b/tools/utils/mextr.h new file mode 100644 index 00000000..ad7672ac --- /dev/null +++ b/tools/utils/mextr.h @@ -0,0 +1,216 @@ +#ifndef MEXTR_H_ +#define MEXTR_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bbi_structs.h" + +#define LOG10 2.30258509299404568402 + +#define DEFAULT_UNDER_CONV 0.01 +#define DEFAULT_OVER_CONV 0.05 +#define DEFAULT_MAPQ_THRESH 20 +#define DEFAULT_BQ_THRESH 20 +#define DEFAULT_REF_BIAS 2 +#define MAX_QUAL 43 +#define DEFAULT_SELECT_THRESH 20 + +#define READ_BUF_SIZE 1024 +#define REC_BUF_SIZE 2048 + +void error(const char *format, ...) 
HTS_NORETURN; + +typedef struct { + uint64_t n_sites; + uint64_t n_sites_pass; +} stats_t; + +typedef enum {FMT_FT, FMT_MC8, FMT_AMQ, FMT_CX, FMT_AQ, FMT_MQ, FMT_GQ, FMT_GOF, FMT_GL} fmt_tag; +typedef enum {CPGMODE_COMBINED, CPGMODE_SEPARATE} cpg_mode; +typedef enum {SELECT_HOM, SELECT_HET} select_mode; +typedef enum {BEDMETHYL_NONE = 0, BEDMETHYL_CPG, BEDMETHYL_CHG, BEDMETHYL_CHH} bedmethyl_type; + +typedef struct { + void *dat_p; + int dat_n; + int ne; +} fmt_store_t; + +typedef struct { + char *tag; + int type; + fmt_store_t st[2]; +} fmt_field_t; + +typedef struct { + int32_t counts[8]; + int32_t aqual[8]; // Average base quality + double gt_prob[10]; // Genotype log probabilities (Log10) + double cmeth[3], gmeth[3]; + double sum; + uint8_t max_gt; + bool skip; +} gt_meth; + +typedef struct { + double prob_best; + double prob_cg; + uint8_t max_gt[2]; + double m; +} cpg_prob; + +#define RJ_OUTPUT_CPG 1 +#define RJ_OUTPUT_NONCPG 2 +#define RJ_OUTPUT_BEDMETHYL 4 +#define RJ_ALL (RJ_OUTPUT_CPG | RJ_OUTPUT_NONCPG | RJ_OUTPUT_BEDMETHYL) +#define REC_SKIP 64 +#define REC_READY 128 + +#define N_FTAGS 6 + +typedef struct _rec { + int32_t rid; + hts_pos_t pos; + char ref; + char cx[5]; + uint8_t tasks; + uint8_t max_common_gt; + fmt_store_t tags[N_FTAGS]; + gt_meth *sample_gt; +} rec_t; + +typedef struct { + bcf1_t *buf[READ_BUF_SIZE]; + uint64_t idx[READ_BUF_SIZE]; + int read_pos; + int write_pos; + pthread_mutex_t mut; + pthread_cond_t cond[2]; +} bcf1_buffer_t; + +typedef struct { + rec_t *buf[REC_BUF_SIZE]; + uint64_t first_index; + pthread_mutex_t mut; +} rec_buffer_t; + +typedef struct { + bbi_cblock_t *cblocks; // Buffer for holding output bbi data for compression + uint32_t pos; + int n_cblocks; + pthread_mutex_t mut; + pthread_cond_t cond[3]; + bool end_of_input; +} cblock_buffer_t; + +typedef struct { + bcf_hdr_t *hdr; + char *cpgfilename; + char *noncpgfilename; + char *reportfilename; + char *bedmethyl; + char *bedmethylnames[3]; + char *bigbednames[3]; + 
char *bigwignames[2]; + char *bedmethyl_track_line; + char *bedmethyl_desc; + double *sample_Q[3]; + double *sample_Q1[3]; + cpg_prob *sample_cpg; + htsFile *cpgfile; + htsFile *noncpgfile; + FILE *reportfile; + htsFile *bedmethylfiles[3]; + FILE *bigbedfiles[3]; + FILE *bigwigfiles[2]; + bbi_header_t *bbi_hdr[2]; + bbi_ctg_data_t *ctg_data; + uint64_t *cumul_len; + int *id_trans; // translate from contig ids in BCF/VCF to contig ids in bbi files + kstring_t pr_str[3]; + bbi_global_data_t bb_global[5]; + stats_t *stats; + cpg_mode mode; + select_mode sel_mode; + bcf_srs_t *sr; + bcf1_buffer_t read_buf; + rec_buffer_t rec_buf; + cblock_buffer_t cblock_buf; + int sel_thresh; + int threads; + int compress_threads; + bool compress; + bool common_gt; + bool output_noncpg; + bool header; + bool proc_finished; + bool input_finished; + bool rec_finished; + bool strand_specific; + bool calc_md5; + bool tabix; + uint8_t job_mask; + double min_prop; + int min_num; + int min_inform; + int min_nc; + double ref_bias; + double under_conv; + double over_conv; + int bq_thresh; + int mq_thresh; +} args_t; + +typedef struct { + args_t * args; + void (*output)(args_t *const, const rec_t * const); + uint8_t job_flag; +} thr_info_t; + +typedef struct { + args_t * args; + int thread_idx; +} gthr_info_t; + +void calc_gt_prob(gt_meth *gt, args_t *args, char rf); +void calc_cpg_meth(args_t * const args, int ns, cpg_prob *cpg, gt_meth *g1, gt_meth *g2); +double get_meth(gt_meth *g, int idx); +void output_cpg(args_t *const args, rec_t ** const lrec, const int idx); +void output_noncpg(args_t *const args, const rec_t * const rec); +void output_bedmethyl(args_t *const args, const rec_t * const rec); +void fill_base_prob_table(void); +void print_headers(args_t *args); +int calc_phred(double z); +double *get_prob_dist(int ns, double *Q[]); +char trans_base[256]; + +#define ks_output(fp, s) { \ + int r; \ + if((fp)->format.compression != no_compression) r = bgzf_write((fp)->fp.bgzf, (s)->s, 
(s)->l); \ + else r = hwrite((fp)->fp.hfile, (s)->s, (s)->l); \ + if(r != (s)->l) error("output error writing to %s\n", (fp)->fn ? (fp)->fn : ""); \ +} \ + +const char *usage(void); +void handle_command_line(int argc, char *argv[], args_t * const args); +void init_params(args_t *const args); +void init_files(args_t *a); +void close_files(args_t *a); +void init_stats(args_t *a); +void write_stats(args_t *a); +void *unpack_bcf_thread(void *p); +void *read_thread(void * const p); +rec_t *rec_init(const int ns); +void *handle_rec_buf(void *p); +void *cpg_thread(void *p); +void *output_thread(void *p); +void calc_file_md5(char * const name); + +#endif // MEXTR_H_ diff --git a/tools/utils/output.c b/tools/utils/output.c new file mode 100644 index 00000000..66b9fc10 --- /dev/null +++ b/tools/utils/output.c @@ -0,0 +1,850 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "mextr.h" +#include "bbi.h" +#include "htslib/bgzf.h" +#include "htslib/hfile.h" +#include "htslib/tbx.h" + +void output_cpg(args_t *const args, rec_t ** const lrec, const int idx) { + static char *gt_iupac = "AMRWCSYGKT"; + static uint8_t gt_msk[] = {0x11, 0xb3, 0x55, 0x99, 0xa2, 0xf6, 0xaa, 0x54, 0xdc, 0x88}; + double **Q = args->sample_Q; + + htsFile *fp = args->cpgfile; + kstring_t *s = &args->pr_str[0]; + int ns = bcf_hdr_nsamples(args->hdr); + int min_n = args->min_num; + int n1 = (int)(args->min_prop * (double)ns + 0.5); + if(n1 > min_n) min_n = n1; + const rec_t * const rec1 = lrec[idx]; + const rec_t * const rec2 = lrec[idx ^ 1]; + if(fp != NULL) { + // Build up prob. 
distribution Q(i) where Q(i) = prob that i samples have genotype CG/CG + bool skip = true; + for(int ix = 0; ix < ns; ix++) { + gt_meth *g1 = rec1->sample_gt+ix, *g2 =rec2->sample_gt+ix; + double z = 0.0; + if(!(g1->skip || g2->skip)) { + if((g1->counts[5] + g1->counts[7] >= args->min_inform) || (g2->counts[6] + g1->counts[4] >= args->min_inform)) { + if(args->sel_mode == SELECT_HOM) { + z = exp(g1->gt_prob[4] + g2->gt_prob[7]); + if(g1->max_gt == 4 && g2->max_gt == 7) skip = false; + } else { + z = (exp(g1->gt_prob[1]) + exp(g1->gt_prob[4]) + exp(g1->gt_prob[5]) + exp(g1->gt_prob[6])) * + (exp(g2->gt_prob[2]) + exp(g2->gt_prob[5]) + exp(g2->gt_prob[7]) + exp(g2->gt_prob[8])); + if((g1->max_gt == 1 || (g1->max_gt >= 4 && g1->max_gt <= 6)) && + (g2->max_gt == 2 || g2->max_gt == 5 || g2->max_gt == 7 || g2->max_gt == 8)) skip = false; + } + } + } + Q[2][ix] = z; + } + double *p = get_prob_dist(ns, Q); + double z = p[0]; + for(int i = 1; i <= ns && i < min_n; i++) z += p[i]; + int phred = calc_phred(z); + if(!skip && phred >= args->sel_thresh) { + int cx_sz = rec1->tags[FMT_CX].ne / ns; + if(args->mode == CPGMODE_COMBINED) { + calc_cpg_meth(args, ns, args->sample_cpg, lrec[idx]->sample_gt, lrec[idx ^ 1]->sample_gt); + char ref[2] = {rec1->ref, rec2->ref}; + ksprintf(ks_clear(s), "%s\t%" PRId64 "\t%" PRId64 "\t%.2s", args->hdr->id[BCF_DT_CTG][rec1->rid].key, rec1->pos, rec1->pos + 2, ref); + char *cx_p = rec1->tags[FMT_CX].dat_p; + int *mq_p1 = rec1->tags[FMT_MQ].ne == ns ? rec1->tags[FMT_MQ].dat_p : NULL; + int *mq_p2 = rec2->tags[FMT_MQ].ne == ns ? rec2->tags[FMT_MQ].dat_p : NULL; + for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { + gt_meth *g1 = rec1->sample_gt+ix, *g2 =rec2->sample_gt+ix; + if(!(g1->skip || g2->skip)) { + int gq = calc_phred(1.0 - exp(g1->gt_prob[g1->max_gt] + g2->gt_prob[g2->max_gt])); // Prob. 
of not being called genotype + ksprintf(s, "\t%c%c\tGQ=%d", gt_iupac[g1->max_gt], gt_iupac[g2->max_gt], gq); + if(g1->max_gt != 4 || g2->max_gt != 7) { + int dq = calc_phred(exp(g1->gt_prob[4] + g2->gt_prob[7])); // Prob. of being CG + ksprintf(s, ";DQ=%d", dq); + } + int mq = -1; + if(mq_p1 != NULL) { + if(mq_p2 != NULL) { + double n1 = 0.0, n2 = 0.0; + for(int k = 0; k < 8; k++) { + n1 += (double)g1->counts[k]; + n2 += (double)g2->counts[k]; + } + if(n1 + n2 > 0.0) { + double mq1 = (double)mq_p1[ix]; + double mq2 = (double)mq_p2[ix]; + mq = (int32_t)(0.5 + sqrt((mq1 * mq1 * n1 + mq2 * mq2 + n2) / (n1 + n2))); + } + } else mq = mq_p1[ix]; + } else if(mq_p2 != NULL) mq = mq_p2[ix]; + if(mq >= 0) ksprintf(s, ";MQ=%d", mq); + int32_t ct[4]; + ct[0] = g1->counts[5] + g2->counts[6]; + ct[1] = g1->counts[7] + g2->counts[4]; + ct[2] = ct[3] = 0; + uint8_t m = 1; + uint8_t msk1 = gt_msk[g1->max_gt]; + uint8_t msk2 = gt_msk[g2->max_gt]; + for(int i = 0; i < 8; i++, m <<= 1) { + ct[3] += g1->counts[i] + g2->counts[i]; + if(msk1 & m) ct[2] += g1->counts[i]; + if(msk2 & m) ct[2] += g2->counts[i]; + } + ksprintf(s, "\t%.3f\t%d\t%d\t%d\t%d", args->sample_cpg[ix].m, ct[0], ct[1], ct[2], ct[3]); + } else { + kputs("\t.\t.\t.\t.\t.\t.\t.", s); + } + } + kputc('\n', s); + } else { + for(int pos = 0; pos < 2; pos++) { + rec_t *rec = lrec[idx ^ pos]; + int *mq_p = rec->tags[FMT_MQ].ne == ns ? rec->tags[FMT_MQ].dat_p : NULL; + ksprintf(ks_clear(s), "%s\t%" PRId64 "\t%" PRId64 " \t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, rec->ref); + char *cx_p = rec->tags[FMT_CX].dat_p; + for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { + gt_meth *g = rec->sample_gt+ix; + if(!g->skip) { + int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. of not being called genotype + ksprintf(s, "\t%c\tGQ=%d", gt_iupac[g->max_gt], gq); + if(g->max_gt != (pos ? 7 : 4)) { + int dq = calc_phred(exp(g->gt_prob[pos ? 7 : 4])); // Prob. 
of being CG + ksprintf(s, ";DQ=%d", dq); + } + int mq = -1; + if(mq_p != NULL) mq = mq_p[ix]; + if(mq >= 0) ksprintf(s, ";MQ=%d", mq); + int32_t ct[4]; + if(pos) { + ct[0] = g->counts[6]; + ct[1] = g->counts[4]; + } else { + ct[0] = g->counts[5]; + ct[1] = g->counts[7]; + } + ct[2] = ct[3] = 0; + uint8_t m = 1; + uint8_t msk = gt_msk[g->max_gt]; + for(int i = 0; i < 8; i++, m <<= 1) { + ct[3] += g->counts[i]; + if(msk & m) ct[2] += g->counts[i]; + } + double meth = get_meth(g, pos); + ksprintf(s, "\t%.3f\t%d\t%d\t%d\t%d", meth, ct[0], ct[1], ct[2], ct[3]); + } else { + kputs("\t.\t.\t.\t.\t.\t.\t.", s); + } + } + kputc('\n', s); + } + } + ks_output(fp, s); + } + } +} + +void output_noncpg(args_t *const args, const rec_t * const rec) { + if(!rec) return; + static char *gt_iupac = "AMRWCSYGKT"; + static uint8_t gt_msk[] = {0x11, 0xb3, 0x55, 0x99, 0xa2, 0xf6, 0xaa, 0x54, 0xdc, 0x88}; + double **Q = args->sample_Q1; + kstring_t *s = &args->pr_str[1]; + int ns = bcf_hdr_nsamples(args->hdr); + int min_n = args->min_num; + int n1 = (int)(args->min_prop * (double)ns + 0.5); + if(n1 > min_n) min_n = n1; + htsFile *fp = args->noncpgfile; + assert(fp != NULL); + bool cstrand = true; + uint8_t gt = rec->max_common_gt; + if(gt == 7 || gt == 2 || gt == 8) cstrand = false; + else if(gt == 5 && rec->ref == 'G') cstrand = false; + for(int ix = 0; ix < ns; ix++) { + double z = 0.0; + gt_meth *g = rec->sample_gt + ix; + if(!g->skip) { + if(cstrand) { + if(g->counts[5] >= args->min_nc && (g->counts[5] + g->counts[7] >= args->min_inform)) { + if(args->sel_mode == SELECT_HOM) z = exp(g->gt_prob[4]); + else z = exp(g->gt_prob[1]) + exp(g->gt_prob[4]) + exp(g->gt_prob[5]) + exp(g->gt_prob[6]); + } + } else { + if(g->counts[6] >= args->min_nc && (g->counts[6] + g->counts[4] >= args->min_inform)) { + if(args->sel_mode == SELECT_HOM) z = exp(g->gt_prob[7]); + else z = exp(g->gt_prob[2]) + exp(g->gt_prob[5]) + exp(g->gt_prob[7]) + exp(g->gt_prob[8]); + } + } + } + Q[2][ix] = z; + } + double 
*p = get_prob_dist(ns, Q); + double z = p[0]; + for(int i = 1; i <= ns && i < min_n; i++) z += p[i]; + int phred = calc_phred(z); + if(phred >= args->sel_thresh) { + int cx_sz = rec->tags[FMT_CX].ne / ns; + int *mq_p = rec->tags[FMT_MQ].ne == ns ? rec->tags[FMT_MQ].dat_p : NULL; + ksprintf(ks_clear(s),"%s\t%"PRId64"\t%"PRId64"\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 1, rec->ref); + char *cx_p = rec->tags[FMT_CX].dat_p; + for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { + gt_meth *g = rec->sample_gt + ix; + if(!g->skip) { + int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. of not being called genotype + ksprintf(s, "\t%c\tGQ=%d", gt_iupac[g->max_gt], gq); + if(g->max_gt != (cstrand ? 4 : 7)) { + int dq = calc_phred(exp(g->gt_prob[cstrand ? 4 : 7])); // Prob. of being CG + ksprintf(s, ";DQ=%d", dq); + } + int mq = -1; + if(mq_p != NULL) mq = mq_p[ix]; + if(mq >= 0) ksprintf(s, ";MQ=%d", mq); + if(cx_sz >= 5) ksprintf(s, ";CX=%.3s", cx_p + 2); + int32_t ct[4]; + if(!cstrand) { + ct[0] = g->counts[6]; + ct[1] = g->counts[4]; + } else { + ct[0] = g->counts[5]; + ct[1] = g->counts[7]; + } + ct[2] = ct[3] = 0; + uint8_t m = 1; + uint8_t msk = gt_msk[g->max_gt]; + for(int i = 0; i < 8; i++, m <<= 1) { + ct[3] += g->counts[i]; + if(msk & m) ct[2] += g->counts[i]; + } + double meth = get_meth(g, !cstrand); + ksprintf(s, "\t%g\t%d\t%d\t%d\t%d", meth, ct[0], ct[1], ct[2], ct[3]); + } else { + kputs("\t.\t.\t.\t.\t.\t.\t.\t.", s); + } + } + kputc('\n', s); + ks_output(fp, s); + } +} + +static char *rgb_tab[11] = { "0,255,0", "55,255,0", "105,255,0", "155,255,0", "205,255,0", "255,255,0", + "255,205,0", "255,155,0", "255,105,0", "255,55,0", "255,0,0" }; + +void output_bedmethyl(args_t *const args, const rec_t * const rec) { + static int32_t old_rid = -1; + static int64_t old_pos = -1; + + if(!rec) { + // End of input + // Flush buffers + if(old_rid >= 0) { + int id1 = args->id_trans[old_rid]; + assert(id1 >= 0); + finish_bbi_blocks(args, 
id1); + } + return; + } + kstring_t *s = ks_clear(&args->pr_str[2]); + if(rec->rid == old_rid && rec->pos <= old_pos) return; + int ns = bcf_hdr_nsamples(args->hdr); + if(ns > 1) return; + gt_meth *g = rec->sample_gt; + + if(!g->skip) { + char strand; + if(rec->ref == 'C') strand = '+'; + else if(rec->ref == 'G') strand = '-'; + else return; + + char *cx_p = rec->tags[FMT_CX].dat_p; + int cx_sz = rec->tags[FMT_CX].ne; + char rtmp[8]; + if(strand == '+') { + int k; + for(k = 0; k < 3; k++) rtmp[k] = rec->cx[k + 2]; + for(k = 0; k < 3 && k < cx_sz - 2; k++) rtmp[k + 4] = cx_p[k + 2]; + for(;k < 3; k++) rtmp[k + 4] = 'N'; + } else { + int k; + for(k = 0; k < 3; k++) rtmp[2 - k] = trans_base[(int)rec->cx[k]]; + for(k = 0; k < 3 && k < cx_sz; k++) rtmp[6 - k] = trans_base[(int)cx_p[k]]; + for(;k < 3; k++) rtmp[6 - k] = 'N'; + } + bedmethyl_type btype = BEDMETHYL_NONE; + assert(rtmp[0] == 'C'); + if(rtmp[1] == 'G') { + btype = BEDMETHYL_CPG; + rtmp[2] = rtmp[6] = 0; + } else { + btype = rtmp[2] == 'G' ? BEDMETHYL_CHG : BEDMETHYL_CHH; + rtmp[3] = rtmp[7] = 0; + } + if(btype != BEDMETHYL_NONE) { + int32_t ct[2]; + if(strand == '-') { + ct[0] = g->counts[6]; + ct[1] = g->counts[4]; + } else { + ct[0] = g->counts[5]; + ct[1] = g->counts[7]; + } + int32_t cov = ct[0] + ct[1]; + double m = cov > 0 ? (double)ct[0] / (double)cov : 0.0; + + if(cov > 0) { + htsFile *fp = args->bedmethylfiles[btype - 1]; + assert(fp); + int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. of not being called genotype + ksprintf(s, "%s\t%"PRId64"\t%"PRId64"\t", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 1); + size_t l = s->l; + ksprintf(s, "\"%s\"\t%d\t%c\t%"PRId64"\t%"PRId64"\t%s\t%d\t%d\t%s\t%s\t%d\n", args->bedmethyl_desc, cov > 1000 ? 
1000 : cov, + strand, rec->pos, rec->pos + 1, rgb_tab[(int)(m * 10.0 + 0.5)], cov, (int)(100.0 * m), rtmp, rtmp + 4, gq); + size_t sz_fields = s->l - l - 1; + ks_output(fp, s); + int id = args->id_trans[rec->rid]; + assert(id >= 0); + bbi_ctg_data_t * const cdata = args->ctg_data + id; + if(rec->rid != old_rid) { + if(old_rid >= 0) { + int id1 = args->id_trans[old_rid]; + assert(id1 >= 0); + finish_bbi_blocks(args, id1); + } + old_rid = rec->rid; + for(int j = 0; j < (args->strand_specific ? 5 : 4); j++) { + for(int i = 0; i < ZOOM_LEVELS; i++) args->bb_global[j].res_end[i] = 0; + } + } + + // + // First handle bigBed data + // + bbi_data_t * bdata = args->ctg_data[id].bbi_data + btype - 1; + + // Check if this is a new block + if(bdata->n_items == 0) { + bdata->bbuf.offset = 0; + bdata->bbuf.start = rec->pos; + bdata->bbuf.end = rec->pos + 1; + } + + // Add data to current block + uint32_t dt[3] = {id, rec->pos, rec->pos + 1}; + kstring_t *s1 = args->bb_global[btype - 1].buffer; + args->bb_global[btype - 1].n_rec++; + const uint32_t * zoom_scales = args->bb_global[btype - 1].zoom_scales; + kputsn_((char *)dt, sizeof(uint32_t) * 3, s1); + kputsn(s->s + l, sz_fields, s1); + s1->l++; + bdata->bbuf.end = rec->pos + 1; + // Compress and write out block if full + if(++(bdata->n_items) >= ITEMS_PER_SLOT) finish_bb_block(args, id, btype - 1); + + // Collect count data for zoom levels + + uint32_t * res_end = args->bb_global[btype - 1].res_end; + uint32_t * res_size = args->bb_global[btype - 1].res_size; + for(int i = 0; i < ZOOM_LEVELS; i++) { + if(rec->pos >= res_end[i]) { + res_size[i]++; + res_end[i] = rec->pos + zoom_scales[i]; + } + } + + // + // And now bigWig data + // + const int k = (!args->strand_specific || strand == '+') ? 
3 : 4; + bdata = args->ctg_data[id].bbi_data + k; + s = args->bb_global[k].buffer; + zoom_scales = args->bb_global[k].zoom_scales; + bw_rec_t * const bw_rec = bdata->bw_rec + bdata->n_items; + bw_rec->start = rec->pos; + bw_rec->val = 100.0 * m; + // Compress and write out block if full + if(++(bdata->n_items) >= BW_ITEMS_PER_SLOT) finish_bw_block(args, id, k - 3); + + // Collect data for zoom levels + res_end = args->bb_global[k].res_end; + res_size = args->bb_global[k].res_size; + for(int i = 0; i < ZOOM_LEVELS; i++) { + if(rec->pos >= res_end[i]) { + res_size[i]++; + res_end[i] = rec->pos + zoom_scales[i]; + } + } + // + // And now collect detailed Zoom data (for both bigBed and bigWig + // + zoom_dt_t * const zd = &cdata->zoom_data; + int ix = rec->pos >> 1; + uint8_t base_type = btype; + if(strand == '-') base_type |= 4; + if(ct[0] > 0) { + base_type |= 8; + hts_resize(float, zd->val_ix + 1, &zd->val_size, &zd->val, 0); + zd->val[zd->val_ix++] = 100.0 * m; + } + if(!(rec->pos & 1)) base_type <<= 4; + zd->base_type[ix] |= base_type; + old_pos = rec->pos; + } + } + } +} + +int add_bb_zoom_data_item(const uint32_t ct, const int ctg, const int ix, const uint32_t start, const uint32_t end, args_t * const args) { + int ret = 0; + if(ct > 0) { + const float fct = (float)ct; + ret = 1; + bbi_data_t * const bdata = args->ctg_data[ctg].bbi_data + ix; + // Check if this is a new block + if(bdata->n_items == 0) { + bdata->bbuf.offset = 0; + bdata->bbuf.start = start; + bdata->bbuf.end = end; + } + // Add data to current block + uint32_t dt[4] = {ctg, start, end, ct}; + float dt1[4] = {1.0, 1.0, fct, fct}; + kstring_t *s = args->bb_global[ix].buffer; + kputsn_((char *)dt, sizeof(uint32_t) * 4, s); + kputsn_((char *)dt1, sizeof(float) * 4, s); + bdata->bbuf.end = end; + // Compress and write out block if full + if(++(bdata->n_items) >= ITEMS_PER_SLOT) finish_bb_block(args, ctg, ix); + } + return ret; +} + +int add_bw_zoom_data_item(bw_zrec_t * const bwr, const int ctg, 
const int ix, uint32_t scale, args_t * const args) { + int ret = 0; + if(bwr->count > 0) { + ret = 1; + bbi_data_t * const bdata = args->ctg_data[ctg].bbi_data + ix + 3; + uint32_t end = bwr->end_base; + uint32_t start = end - scale; + // Check if this is a new block + if(bdata->n_items == 0) { + bdata->bbuf.offset = 0; + bdata->bbuf.start = start; + bdata->bbuf.end = end; + } + // Add data to current block + uint32_t dt[4] = {ctg, start, end, bwr->count}; + float dt1[4] = {bwr->min, bwr->max, bwr->x, bwr->xsq}; + kstring_t *s = args->bb_global[ix + 3].buffer; + kputsn_((char *)dt, sizeof(uint32_t) * 4, s); + kputsn_((char *)dt1, sizeof(float) * 4, s); + bdata->bbuf.end = end; + // Compress and write out block if full + if(++(bdata->n_items) >= BW_ITEMS_PER_SLOT) finish_bb_block(args, ctg, ix + 3); + } + return ret; +} + +void add_bb_zrec(const uint32_t ct, const int ctg, const int zoom_level, const int ix, const uint32_t end, args_t * const args) { + assert(zoom_level > 0); + if(ct > 0) { + bbi_zblock_t * const bzb = args->ctg_data[ctg].bbi_data[ix].zblock + zoom_level - 1; + hts_resize(bb_zrec_t, bzb->ix + 1, &bzb->size, &bzb->bb_rec, 0); + bb_zrec_t * const zrec = bzb->bb_rec + (bzb->ix++); + zrec->end_base = end; + zrec->count = ct; + } +} + +void add_bw_zrec(bw_zrec_t * const bwr, const int ctg, const int zoom_level, const int ix, uint32_t scale, args_t * const args) { + assert(zoom_level > 0); + if(bwr->count > 0) { + bbi_zblock_t * const bzb = args->ctg_data[ctg].bbi_data[ix + 3].zblock + zoom_level - 1; + hts_resize(bw_zrec_t, bzb->ix + 1, &bzb->size, &bzb->bw_rec, 0); + bw_zrec_t * const zrec = bzb->bw_rec + (bzb->ix++); + memcpy(zrec, bwr, sizeof(bw_zrec_t)); + } +} + +void process_zoom_rec(int ctg, uint32_t pos, uint8_t x, uint32_t *count, uint32_t ct[ZOOM_LEVELS][3], uint32_t end[ZOOM_LEVELS][3], + bw_zrec_t bwr[ZOOM_LEVELS][2], float * const val, int * const val_ix, args_t * const args) { + + const int ix = (x & 3) - 1; + const int ix1 = 
(args->strand_specific && ((x & 4)) ? 1 : 0); + const uint32_t * scales = args->bb_global[ix].zoom_scales; + const uint32_t * scales1 = args->bb_global[ix1 + 3].zoom_scales; + args->bb_global[ix].total_bases++; + args->bb_global[ix1 + 3].total_bases++; + float m = 0.0; + if(x & 8) { + m = val[(*val_ix)++]; + double m1 = m; + if(m1 < args->bb_global[ix1 + 3].min_x) args->bb_global[ix1 + 3].min_x = m1; + if(m1 > args->bb_global[ix1 + 3].max_x) args->bb_global[ix1 + 3].max_x = m1; + args->bb_global[ix1 + 3].sum_x += m1; + args->bb_global[ix1 + 3].sum_xsq += m1 * m1; + } else args->bb_global[ix1 + 3].min_x = 0.0; + for(int k = 0; k < ZOOM_LEVELS; k++) { + // bigBed zoom data + if(pos >= end[k][ix]) { + if(k == 0) count[ix] += add_bb_zoom_data_item(ct[0][ix], ctg, ix, end[0][ix] - scales[0], end[0][ix], args); + else add_bb_zrec(ct[k][ix], ctg, k, ix, end[k][ix], args); + ct[k][ix] = 1; + end[k][ix] = pos + scales[k]; + } else ct[k][ix]++; + // bigWig zoom data + if(pos >= bwr[k][ix1].end_base) { + if(k == 0) count[ix1 + 3] += add_bw_zoom_data_item(&bwr[k][ix1], ctg, ix1, scales1[0], args); + else add_bw_zrec(&bwr[k][ix1], ctg, k, ix1, scales1[k], args); + bwr[k][ix1].end_base = pos + scales1[k]; + bwr[k][ix1].count = 1; + bwr[k][ix1].min = bwr[k][ix1].max = bwr[k][ix1].x = m; + bwr[k][ix1].xsq = m * m; + } else { + bwr[k][ix1].count++; + bwr[k][ix1].x += m; + bwr[k][ix1].xsq += m * m; + if(m < bwr[k][ix1].min) bwr[k][ix1].min = m; + else if(m > bwr[k][ix1].max) bwr[k][ix1].max = m; + } + } +} + +void make_tabix_index(char * const fname) { + tbx_conf_t conf = tbx_conf_bed; + conf.line_skip = 1; + tbx_index_build(fname, 0, &conf); +} + +void *md5_thread(void *p) { + calc_file_md5(p); + return NULL; +} + +void *cpg_thread(void *p) { + args_t * args = p; + rec_buffer_t * const rec_buf = &args->rec_buf; + rec_t *lrec[2] = {NULL, NULL}; + const struct timespec wt = {0, 5000}; + uint64_t curr_idx = 0; + int ix = 0; + pthread_mutex_lock(&rec_buf->mut); + while(true) { + 
while(curr_idx - rec_buf->first_index >= REC_BUF_SIZE){ + pthread_mutex_unlock(&rec_buf->mut); + nanosleep(&wt, NULL); + pthread_mutex_lock(&rec_buf->mut); + } + rec_t *rec = rec_buf->buf[curr_idx - rec_buf->first_index]; + while(!args->rec_finished && !(rec->tasks & REC_READY)) { + pthread_mutex_unlock(&rec_buf->mut); + nanosleep(&wt, NULL); + pthread_mutex_lock(&rec_buf->mut); + } + if(!(rec->tasks & REC_READY) && args->rec_finished) break; + rec_t * const rec1 = lrec[ix ^ 1]; + if((rec->tasks & (RJ_OUTPUT_CPG | REC_SKIP)) == RJ_OUTPUT_CPG) { + lrec[ix] = rec; + if(rec1) { + // Are the entries consecutive ? + if(rec1->rid == rec->rid && rec1->pos + 1 == rec->pos) { + pthread_mutex_unlock(&rec_buf->mut); + output_cpg(args, lrec, ix ^ 1); + pthread_mutex_lock(&rec_buf->mut); + } + rec1->tasks &= ~RJ_OUTPUT_CPG; + } + } else { + lrec[ix] = NULL; + rec->tasks &= ~RJ_OUTPUT_CPG; + if(rec1) rec1->tasks &= ~RJ_OUTPUT_CPG; + } + curr_idx++; + ix ^= 1; + } + if(lrec[ix ^ 1]) lrec[ix ^ 1]->tasks &= ~RJ_OUTPUT_CPG; + pthread_mutex_unlock(&rec_buf->mut); + hts_close(args->cpgfile); + args->cpgfile = NULL; + pthread_t md5_th; + if(args->calc_md5) pthread_create(&md5_th, NULL, md5_thread, args->cpgfilename); + if(args->tabix) make_tabix_index(args->cpgfilename); + if(args->calc_md5) pthread_join(md5_th, NULL); + return NULL; +} + +void *output_thread(void *p) { + thr_info_t *th = p; + args_t * args = th->args; + rec_buffer_t * const rec_buf = &args->rec_buf; + const struct timespec wt = {0, 5000}; + uint64_t curr_idx = 0; + pthread_mutex_lock(&rec_buf->mut); + while(true) { + while(curr_idx - rec_buf->first_index >= REC_BUF_SIZE){ + pthread_mutex_unlock(&rec_buf->mut); + nanosleep(&wt, NULL); + pthread_mutex_lock(&rec_buf->mut); + } + rec_t *rec = rec_buf->buf[curr_idx - rec_buf->first_index]; + while(!args->rec_finished && !(rec->tasks & REC_READY)) { + pthread_mutex_unlock(&rec_buf->mut); + nanosleep(&wt, NULL); + pthread_mutex_lock(&rec_buf->mut); + } + if(!(rec->tasks & 
REC_READY) && args->rec_finished) break; + if((rec->tasks & (th->job_flag | REC_SKIP)) == th->job_flag) { + pthread_mutex_unlock(&rec_buf->mut); + th->output(args, rec); + pthread_mutex_lock(&rec_buf->mut); + } + curr_idx++; + rec->tasks &= ~th->job_flag; + } + pthread_mutex_unlock(&rec_buf->mut); + // Indicate end of input for bbi output + th->output(args, NULL); + if(th->job_flag == RJ_OUTPUT_NONCPG) { + hts_close(args->noncpgfile); + args->noncpgfile = NULL; + args->cpgfile = NULL; + pthread_t md5_th; + if(args->calc_md5) pthread_create(&md5_th, NULL, md5_thread, args->noncpgfilename); + if(args->tabix) make_tabix_index(args->noncpgfilename); + if(args->calc_md5) pthread_join(md5_th, NULL); + } else { + for(int i = 0; i < 3; i++) { + hts_close(args->bedmethylfiles[i]); + args->bedmethylfiles[i] = NULL; + } + if(args->calc_md5) { + pthread_t md5_th[3]; + for(int i = 0; i < 3; i++) pthread_create(md5_th + i, NULL, md5_thread, args->bedmethylnames[i]); + for(int i = 0; i < 3; i++) pthread_join(md5_th[i], NULL); + } + } + return NULL; +} + +void *handle_bedmethyl_thread(void *p) { + thr_info_t *th = p; + args_t * args = th->args; + pthread_t write_th, *compress_th; + + // + // Main data output + // + int nb = args->compress_threads; + compress_th = malloc(nb * sizeof(pthread_t)); + for(int i = 0; i < nb; i++) pthread_create(compress_th + i, NULL, bbi_compress_thread, args); + pthread_create(&write_th, NULL, bbi_write_thread, args); + output_thread(p); + // Signal compression and write threads that there is no more input + args->cblock_buf.end_of_input = true; + // Wake up any compress or writing threads that are waiting for new input + pthread_cond_broadcast(&args->cblock_buf.cond[1]); + pthread_cond_broadcast(&args->cblock_buf.cond[2]); + + // Wait for compression and writing threads to complete + for(int i = 0; i < args->compress_threads; i++) pthread_join(compress_th[i], NULL); + pthread_join(write_th, NULL); + // Store current file position of each file + 
for(int i = 0; i < 3; i++) finish_bb_data_file(args, i); + for(int i = 0; i < (args->strand_specific ? 2 : 1); i++) finish_bw_data_file(args, i); + + // + // Create main index + // + fprintf(stderr, "Creating main indices\n"); + pthread_t bbi_index_thr[5]; + bbi_thr_info_t bbi_tinfo[5]; + int njobs = args->strand_specific ? 5 : 4; + for(int ix = 0; ix < njobs; ix++) { + FILE * const fp = ix < 3 ? args->bigbedfiles[ix] : args->bigwigfiles[ix - 3]; + fseek(fp, args->bbi_hdr[ix < 3 ? 0 : 1]->fullDataOffset, SEEK_SET); + bbi_write(fp, args->bb_global[ix].n_rec); + fseek(fp, args->bb_global[ix].index_offset, SEEK_SET); + bbi_tinfo[ix].args = args; + bbi_tinfo[ix].ix = ix; + bbi_tinfo[ix].nrec = args->bb_global[ix].n_rec; + pthread_create(bbi_index_thr + ix, NULL, bbi_create_index, bbi_tinfo + ix); + } + for(int i = 0; i < njobs; i++) pthread_join(bbi_index_thr[i], NULL); + + // + // Create Zoom data & indices + // + + // Initialize summary block + for(int ix = 0; ix < njobs; ix++) { + args->bb_global[ix].total_bases = 0; + args->bb_global[ix].min_x = DBL_MAX; + args->bb_global[ix].max_x = -DBL_MAX; + args->bb_global[ix].sum_x = 0.0; + args->bb_global[ix].sum_xsq = 0.0; + } + const bool tty = isatty(STDERR_FILENO); + const uint64_t total_len = args->cumul_len[args->sr->regions->nseqs - 1]; + fprintf(stderr, "Creating zoom levels (%d)\n", ZOOM_LEVELS); + for(int i = 0; i < ZOOM_LEVELS; i++) { + clear_cblocks(args);; + for(int ix = 0; ix < njobs; ix++) { + FILE *fp = ix < 3 ? 
args->bigbedfiles[ix] : args->bigwigfiles[ix - 3]; + args->bb_global[ix].zoom_data_offset[i] = ftell(fp); + bbi_write(fp, args->bb_global[ix].res_size[i]); + } + for(int k = 0; k < nb; k++) pthread_create(compress_th + k, NULL, bbi_compress_thread, args); + pthread_create(&write_th, NULL, bbi_write_thread, args); + uint32_t count[5] = {0, 0, 0, 0, 0}; + int old_complete = 0; + uint64_t ctot = 0; + for(int ctg = 0; ctg < args->sr->regions->nseqs; ctg++) { + if(i == 0) { + uint32_t end[ZOOM_LEVELS][3]; + uint32_t ct[ZOOM_LEVELS][3]; + bw_zrec_t bwr[ZOOM_LEVELS][2]; + for(int j = 0; j < ZOOM_LEVELS; j++) { + for(int k = 0; k < 3; k++) end[j][k] = ct[j][k] = 0; + for(int k = 0; k < 2; k++) memset(&bwr[j][k], 0, sizeof(bw_zrec_t)); + } + zoom_dt_t * const zd = &args->ctg_data[ctg].zoom_data; + int val_ix = 0; + float * const val = zd->val; + const uint8_t * tb = zd->base_type; + uint32_t j_limit = (zd->len + 1) >> 1; + for(uint32_t j = 0; j < j_limit; j++) { + for(; !(*tb) && j < j_limit; j++, tb++); + if(j == j_limit) break; + const int pos = j << 1; + const uint8_t x = *tb++; + if(x & 0x30) process_zoom_rec(ctg, pos, x >> 4, count, ct, end, bwr, val, &val_ix, args); + if(x & 3) process_zoom_rec(ctg, pos + 1, x, count, ct, end, bwr, val, &val_ix, args); + if(tty) { + const uint64_t tot = pos + 2 + ctot; + int complete = (int)((tot * 100) / total_len); + if(complete > old_complete) { + old_complete = complete; + fprintf(stderr,"Zoom level %d: %d%% complete \r", i + 1, complete); + } + } + } + assert(val_ix == zd->val_ix); + for(int ix = 0; ix < 3; ix++) { + count[ix] += add_bb_zoom_data_item(ct[0][ix], ctg, ix, end[0][ix] - args->bb_global[ix].zoom_scales[0], end[0][ix], args); + finish_bb_block(args, ctg, ix); + } + for(int ix = 0; ix < (args->strand_specific ? 
2 : 1); ix++) { + count[ix + 3] += add_bw_zoom_data_item(&bwr[0][ix], ctg, ix, args->bb_global[ix + 3].zoom_scales[0], args); + finish_bb_block(args, ctg, ix + 3); + } + for(int k = 1; k < ZOOM_LEVELS; k++) { + for(int ix = 0; ix < 3; ix++) { + add_bb_zrec(ct[k][ix], ctg, k, ix, end[k][ix], args); + } + for(int ix = 0; ix < (args->strand_specific ? 2 : 1); ix++) { + add_bw_zrec(&bwr[k][ix], ctg, k, ix, args->bb_global[ix + 3].zoom_scales[k], args); + } + } + free(zd->base_type); + if(zd->val) free(zd->val); + zd->base_type = NULL; + zd->val = NULL; + zd->val_ix = zd->val_size = 0; + } else { + // Handle the subsequent zoom levels + + // For bigBed files + for(int ix = 0; ix < 3; ix++) { + bbi_zblock_t * const bzb = args->ctg_data[ctg].bbi_data[ix].zblock + i - 1; + const uint32_t scale = args->bb_global[ix].zoom_scales[i]; + bb_zrec_t * zrec = bzb->bb_rec; + if(zrec) { + for(int j = 0; j < bzb->ix; j++, zrec++) { + count[ix] += add_bb_zoom_data_item(zrec->count, ctg, ix, zrec->end_base - scale, zrec->end_base, args); + } + finish_bb_block(args, ctg, ix); + free(bzb->bb_rec); + bzb->bb_rec = NULL; + bzb->ix = bzb->size = 0; + } + } + // and for bigWig files + for(int ix = 0; ix < (args->strand_specific ? 
2 : 1); ix++) { + bbi_zblock_t * const bzb = args->ctg_data[ctg].bbi_data[ix + 3].zblock + i - 1; + const uint32_t scale = args->bb_global[ix + 3].zoom_scales[i]; + bw_zrec_t * zrec = bzb->bw_rec; + if(zrec) { + for(int j = 0; j < bzb->ix; j++, zrec++) { + count[ix + 3] += add_bw_zoom_data_item(zrec, ctg, ix, scale, args); + } + finish_bb_block(args, ctg, ix + 3); + free(bzb->bw_rec); + bzb->bw_rec = NULL; + bzb->ix = bzb->size = 0; + } + } + } + ctot = args->cumul_len[ctg]; + if(tty) { + const uint64_t tot = ctot; + int complete = (int)((tot * 100) / total_len); + if(complete > old_complete) { + old_complete = complete; + fprintf(stderr,"Zoom level %d: %d%% complete\r", i + 1, complete); + } + } + } + // Signal compression and write threads that there is no more input + args->cblock_buf.end_of_input = true; + pthread_cond_broadcast(&args->cblock_buf.cond[1]); + pthread_cond_broadcast(&args->cblock_buf.cond[2]); + // Wait for compression and writing threads to complete + for(int j = 0; j < args->compress_threads; j++) pthread_join(compress_th[j], NULL); + pthread_join(write_th, NULL); + for(int j = 0; j < njobs; j++) { + assert(args->bb_global[j].res_size[i] == count[j]); + } + pthread_t bbi_index_thr[5]; + bbi_thr_info_t bbi_tinfo[5]; + for(int ix = 0; ix < njobs; ix++) { + args->bb_global[ix].zoom_index_offset[i] = ftell(ix < 3 ? args->bigbedfiles[ix] : args->bigwigfiles[ix - 3]); + bbi_tinfo[ix].args = args; + bbi_tinfo[ix].ix = ix; + bbi_tinfo[ix].nrec = count[ix]; + pthread_create(bbi_index_thr + ix, NULL, bbi_create_index, bbi_tinfo + ix); + } + for(int ix = 0; ix < njobs; ix++) pthread_join(bbi_index_thr[ix], NULL); + } + destroy_cblocks(args); + free(compress_th); + + // Write headers, summary etc with correct values + for(int ix = 0; ix < njobs; ix++) { + FILE * const fp = (ix < 3 ? args->bigbedfiles[ix] : args->bigwigfiles[ix - 3]); + bbi_header_t * const h = args->bbi_hdr[ix < 3 ? 
0 : 1]; + h->fullIndexOffset = args->bb_global[ix].index_offset; + h->uncompressBufSize = args->bb_global[ix].max_buf_size; + if(ix < 3) { + double x = args->bb_global[ix].total_bases; + args->bb_global[ix].min_x = args->bb_global[ix].max_x = 1.0; + args->bb_global[ix].sum_x = args->bb_global[ix].sum_xsq = x; + } + write_bbi_header(fp, h, args->bb_global + ix); + fseek(fp, 0, SEEK_END); + bbi_write(fp, h->magic); + fclose(fp); + } + fprintf(stderr,"Zoom level %d: 100%% complete\nCalculating md5 sums of output files\n", ZOOM_LEVELS); + if(args->calc_md5) { + pthread_t md5_th[5]; + for(int ix = 0; ix < njobs; ix++) { + char * const name = ix > 3 ? args->bigbednames[ix] : args->bigwignames[ix - 3]; + pthread_create(md5_th + ix, NULL, md5_thread, name); + } + for(int ix = 0; ix < njobs; ix++) pthread_join(md5_th[ix], NULL); + } + return NULL; +} diff --git a/tools/utils/output_headers.c b/tools/utils/output_headers.c new file mode 100644 index 00000000..ce61cbb1 --- /dev/null +++ b/tools/utils/output_headers.c @@ -0,0 +1,90 @@ +/* + * output_headers.c + * + * Created on: Dec 26, 2019 + * Author: heath + */ + +#include +#include +#include +#include + +#include "mextr.h" +#include "htslib/bgzf.h" +#include "htslib/hfile.h" + +static char *copy_and_strip_quotes(char *s) { + if(!s) return s; + size_t l = strlen(s); + if(l > 1) { + if((s[0] == '\"' && s[l-1] =='\"') || (s[0] == '\'' && s[l-1] =='\'')) { + (s++)[--l] = 0; + } + } + char *s1 = malloc(l + 1); + if(s1 != NULL) memcpy(s1, s, l + 1); + return s1; +} + +static void print_file_header(htsFile *fp, kstring_t *s, int ns, char **names) { + if(fp != NULL) { + kputs("Contig\tPos0\tPos1\tRef", ks_clear(s)); + for(int i = 0; i < ns; i++) { + char *name = names[i]; + ksprintf(s, "\t%s:Call\t%s:Flags\t%s:Meth\t%s:non_conv\t%s:conv\t%s:support_call\t%s:total", name, name, name, name, name, name, name); + } + kputc('\n', s); + ks_output(fp, s); + } +} + +static void print_bedmethyl_headers(args_t *args) { + kstring_t *s = 
ks_clear(&args->pr_str[2]); + if(args->bedmethyl_track_line == NULL) { + char *sample_name = NULL; + char *sample_desc = NULL; + char *sample_bc = NULL; + // Try and get sample info from VCF file headers + bcf_hdr_t *h = args->hdr; + for(int i = 0; i < h->nhrec; i++) { + bcf_hrec_t *hr = h->hrec[i]; + if(hr->type == BCF_HL_STR) { + if(!strcmp(hr->key, "bs_call_sample_info")) { + int ix = bcf_hrec_find_key(hr, "ID"); + if(ix >= 0) { + sample_bc = copy_and_strip_quotes(hr->vals[ix]); + ix = bcf_hrec_find_key(hr, "SM"); + if(ix >= 0) sample_name = copy_and_strip_quotes(hr->vals[ix]); + ix = bcf_hrec_find_key(hr, "DS"); + if(ix >= 0) sample_desc = copy_and_strip_quotes(hr->vals[ix]); + } + } + } + } + if(sample_name == NULL) sample_name = strdup(h->samples[0]); + if(sample_desc == NULL) sample_desc = strdup(sample_name); + ksprintf(s, "track name=\"%s\" description=\"%s\" visibility=2 itemRgb=\"On\"\n", sample_desc, sample_name); + if(sample_bc) free(sample_bc); + if(sample_name) free(sample_name); + args->bedmethyl_desc = sample_desc; + } else { + char *line = args->bedmethyl_track_line; + size_t l = strlen(line); + if(l > 1 && line[l - 1] == '\n') line[--l] = 0; + if(!strncmp(line, "track ", 6)) line += 6; + ksprintf(s, "track %s\n", line); + } + for(bedmethyl_type t = BEDMETHYL_CPG; t <= BEDMETHYL_CHH; t++) { + htsFile *fp = args->bedmethylfiles[t - 1]; + if(fp != NULL) ks_output(fp, s); + } +} + +void print_headers(args_t *args) { + int ns = bcf_hdr_nsamples(args->hdr); + if(args->cpgfile) print_file_header(args->cpgfile, &args->pr_str[0], ns, args->hdr->samples); + if(args->noncpgfile) print_file_header(args->noncpgfile, &args->pr_str[1], ns, args->hdr->samples); + if(args->bedmethyl) print_bedmethyl_headers(args); +} + diff --git a/tools/utils/output_utils.c b/tools/utils/output_utils.c new file mode 100644 index 00000000..1da68166 --- /dev/null +++ b/tools/utils/output_utils.c @@ -0,0 +1,51 @@ +/* + * output_utils.c + * + * Created on: Dec 26, 2019 + * Author: 
heath + */ + + +#include +#include +#include +#include + +#include "mextr.h" +#include "htslib/bgzf.h" +#include "htslib/hfile.h" + +int calc_phred(double z) { + int phred; + if(z <= 0.0) phred = 255; + else { + phred = (int)(-10.0 * log(z) / LOG10); + if(phred > 255) phred = 255; + } + return phred; +} + +double *get_prob_dist(int ns, double *Q[]) { + // Build up prob. distribution Q(i) where Q(i) = prob that i samples have genotype CG/CG + double *p = Q[2]; + double *q0 = Q[0]; + double *q1 = Q[1]; + q0[0] = 1.0; + for(int ix = 0; ix < ns; ix++) { + double z = p[ix]; + q1[0] = q0[0] * (1.0 - z); + for(int k = 1; k <= ix; k++) q1[k] = q0[k - 1] * z + q0[k] * (1.0 - z); + q1[ix + 1] = q0[ix] * z; + double *t = q0; + q0 = q1; + q1 = t; + } + return q0; +} + +char trans_base[256] = { + ['A'] = 'T', ['C'] = 'G', ['G'] = 'C', ['T'] = 'A', + ['Y'] = 'R', ['R'] = 'Y', ['S'] = 'S', ['W'] = 'W', ['K'] = 'M', ['M'] = 'K', + ['B'] = 'V', ['V'] = 'B', ['D'] = 'H', ['H'] = 'D', ['N'] = 'N', ['.'] = '.' 
+}; + diff --git a/tools/utils/rec.c b/tools/utils/rec.c new file mode 100644 index 00000000..703769ec --- /dev/null +++ b/tools/utils/rec.c @@ -0,0 +1,56 @@ +/* + * rec.c + * + * Created on: Dec 26, 2019 + * Author: heath + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "mextr.h" + +rec_t *rec_init(const int ns) { + assert(ns > 0); + rec_t *rec = calloc((size_t)1, sizeof(rec_t)); + rec->sample_gt = malloc(sizeof(gt_meth) * ns); + return rec; +} + +#define REC_BUF_SHIFT 64 + +void *handle_rec_buf(void *p) { + args_t * const args = p; + rec_buffer_t *rec_buf = &args->rec_buf; + rec_t * lbuf[REC_BUF_SIZE]; + const struct timespec wt = {0, 2500}; + int ix = 0; + while(!args->proc_finished) { + while(ix < REC_BUF_SIZE && ((rec_buf->buf[ix]->tasks & (REC_READY | RJ_ALL)) == REC_READY)) ix++; + if(ix >= REC_BUF_SHIFT) { + pthread_mutex_lock(&rec_buf->mut); + rec_t ** bp = rec_buf->buf; + for(int i = 0; i < ix; i++) { + lbuf[i] = bp[i]; + lbuf[i]->tasks = 0; + } + for(int i = 0; i < REC_BUF_SIZE - ix; i++, bp++) *bp = bp[ix]; + for(int i = 0; i < ix; i++) *bp++ = lbuf[i]; + rec_buf->first_index += ix; + pthread_mutex_unlock(&rec_buf->mut); + ix = 0; + } else nanosleep(&wt, NULL); + } + return NULL; +} + diff --git a/tools/utils/stats.c b/tools/utils/stats.c new file mode 100644 index 00000000..f99b5cfe --- /dev/null +++ b/tools/utils/stats.c @@ -0,0 +1,32 @@ +/* + * stats.c + * + * Created on: Dec 26, 2019 + * Author: heath + */ + +#include +#include +#include +#include + +#include "mextr.h" + +void init_stats(args_t *a) { + a->stats = calloc((size_t)1, sizeof(stats_t)); +} + +void write_stats(args_t *a) { + if(a->stats != NULL) { + a->reportfile = a->reportfilename == NULL ? 
NULL : fopen(a->reportfilename, "w"); + if(a->reportfile != NULL) { + FILE *fp = a->reportfile; + stats_t *st = a->stats; + fprintf(fp,"{\n\t\"TotalSites\": %" PRIu64 ",\n", st->n_sites); + fprintf(fp,"\t\"SitesPassed\": %" PRIu64 "\n", st->n_sites_pass); + fputs("}\n", fp); + fclose(fp); + } + } +} + diff --git a/tools/utils/unpack.c b/tools/utils/unpack.c new file mode 100644 index 00000000..0c90a7be --- /dev/null +++ b/tools/utils/unpack.c @@ -0,0 +1,276 @@ +/* + * unpack.c + * + * Created on: Dec 26, 2019 + * Author: heath + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "mextr.h" + +#include "htslib/hfile.h" +#include "htslib/bgzf.h" + +static void process(bcf1_t * const rec, rec_t * const rc, char ** cxp, int32_t * cx_np, const bool silent, args_t * const args) +{ + static char *ftags[N_FTAGS] = {"FT", "MC8", "AMQ", "CX", "AQ", "MQ"}; + static int ftypes[N_FTAGS] = {BCF_HT_STR, BCF_HT_INT, BCF_HT_INT, BCF_HT_STR, BCF_HT_INT, BCF_HT_INT}; + + static int old_id = -1; + static int old_complete = 0; + + int ns = bcf_hdr_nsamples(args->hdr); + bcf_unpack(rec, BCF_UN_FLT); + int n_all = rec->n_allele; + bool cg = false; + for(int i = 0; i < n_all; i++) { + char c = rec->d.allele[i][0]; + if((c == 'C' || c == 'G') && rec->d.allele[i][1] == 0) { + cg = true; + break; + } + } + rc->tasks = args->job_mask | REC_SKIP; + if(cg) { // Site with potentially Cs or Gs + bcf_unpack(rec, BCF_UN_ALL); + // Get format tags + for(int ix = 0; ix < N_FTAGS; ix++) { + fmt_store_t *s = rc->tags + ix; + s->ne = bcf_get_format_values(args->hdr, rec, ftags[ix], &s->dat_p, &s->dat_n, ftypes[ix]); + } + if(rc->tags[FMT_CX].ne > 0 && rc->tags[FMT_MC8].ne == ns * 8) { + // Get sample base counts and genotype probs. 
+ int32_t *mc8_p = rc->tags[FMT_MC8].dat_p; + int32_t *amq_p = rc->tags[FMT_AMQ].dat_p; + int n_amq = rc->tags[FMT_AMQ].ne / ns; + int32_t *aq_p = rc->tags[FMT_AQ].ne == ns ? rc->tags[FMT_AQ].dat_p : NULL; + int32_t *mq_p = rc->tags[FMT_MQ].ne == ns ? rc->tags[FMT_MQ].dat_p : NULL; + double ms_mq = 0.0; + int32_t tot_n = 0; + for(int i = 0; i < ns; i++) { + int32_t *ct = rc->sample_gt[i].counts; + int32_t *amq = rc->sample_gt[i].aqual; + memset(ct, 0, sizeof(int32_t) * 8); + memset(amq, 0, sizeof(int32_t) * 8); + int32_t x = mc8_p[i * 8]; + int k = 0; + if(x != bcf_int32_missing) { + int k1 = 0; + for(int j = 0; j < 8; j++) { + x = mc8_p[i * 8 + j]; + ct[j] += x; + k += x; + if(x > 0 && amq_p != NULL && k1 < n_amq) { + int q = amq_p[i * n_amq + k1++]; + if(q >= 0) { + if(q > MAX_QUAL) q = MAX_QUAL; + amq[j] = q; + } + } + } + if(amq_p == NULL) { + int q = aq_p == NULL ? args->bq_thresh : aq_p[i]; + if(q > MAX_QUAL) q = MAX_QUAL; + for(int j = 0; j < 8; j++) amq[j] = q; + } + } + if(k > 0) { + if(mq_p != NULL) { + int m = mq_p[i]; + ms_mq += (double)k * (double)(m * m); + } + tot_n += k; + calc_gt_prob(rc->sample_gt + i, args, rec->d.allele[0][0]); + rc->sample_gt[i].skip = false; + } else rc->sample_gt[i].skip = true; + } + // If we force a common genotype, calculate prob. 
distribution for common genotype + if(ns > 1) { + double gt[10]; + for(int k = 0; k < 10; k++) gt[k] = 0.0; + for(int i = 0; i < ns; i++) { + if(!rc->sample_gt[i].skip) { + for(int k = 0; k < 10; k++) gt[k] += rc->sample_gt[i].gt_prob[k]; + } + } + double max = gt[0]; + int max_gt = 0; + for(int k = 1; k < 10; k++) { + if(gt[k] > max) { + max = gt[k]; + max_gt = k; + } + } + double sum = 0.0; + for(int k = 0; k < 10; k++) sum += exp(gt[k] - max); + sum = log(sum); + for(int k = 0; k < 10; k++) gt[k] -= (max + sum); + if(args->common_gt) { + for(int i = 0; i < ns; i++) { + if(!rc->sample_gt[i].skip) { + for(int k = 0; k < 10; k++) rc->sample_gt[i].gt_prob[k] = gt[k]; + rc->sample_gt[i].max_gt = max_gt; + rc->sample_gt[i].sum = max + sum; + } + } + } + rc->max_common_gt = max_gt; + } else rc->max_common_gt = rc->sample_gt->max_gt; + + int cx_len = bcf_get_info_values(args->hdr, rec, "CX", (void **)cxp, cx_np, BCF_HT_STR); + rc->ref = (cx_len >= 3 ? (*cxp)[2] : '.'); + int i; + for(i = 0; i < cx_len && i < 5; i++) rc->cx[i] = (*cxp)[i]; + for(;i < 5; i++) rc->cx[i] = 'N'; + rc->pos = rec->pos; + rc->rid = rec->rid; + int id = args->id_trans[rc->rid]; + if(!silent) { + const bool tty = isatty(STDERR_FILENO); + int complete = 0; + if(tty) { + const uint64_t tot1 = rec->pos + 1 + (id ? args->cumul_len[id - 1] : 0); + const uint64_t tot2 = args->cumul_len[args->sr->regions->nseqs - 1]; + complete = (int)((tot1 * 1000) / tot2); + } + if(id > old_id || complete > old_complete) { + old_id = id; + old_complete = complete; + assert(id >= 0); + if(tty) { + fprintf(stderr,"Reading %s (%.1f%% completed) \r", args->sr->regions->seq_names[id], 0.1 * (double)complete); + } else { + fprintf(stderr,"Reading %s\n", args->sr->regions->seq_names[id], complete); + } + } + } + rc->tasks &= ~REC_SKIP; + } + } + rc->tasks |= REC_READY; +} + +#define BUF_SPACE(r, w) ( (w) >= (r) ? (r) + READ_BUF_SIZE - (w) : (r) - (w) ) +#define NUM_READY(r, w) ( (w) >= (r) ? 
(w) - (r) : (w) + READ_BUF_SIZE - (r) ) + +#define READ_BLK_SIZE 64 +#define UNPACK_BLK_SIZE 32 + +void *unpack_bcf_thread(void *p) +{ + gthr_info_t * const gi = p; + args_t * const args = gi->args; + const int thr_idx = gi->thread_idx; + bcf1_t *lbuf[UNPACK_BLK_SIZE]; + rec_t * lrbuf[UNPACK_BLK_SIZE]; + uint64_t idx[UNPACK_BLK_SIZE]; + for(int i = 0; i < UNPACK_BLK_SIZE; i++) lbuf[i] = bcf_init(); + bcf1_buffer_t * const rb = &args->read_buf; + rec_buffer_t * const rec_buf = &args->rec_buf; + volatile int * const wp = &rb->write_pos; + volatile int * const rp = &rb->read_pos; + const struct timespec wt = {0, 5000}; + char *cx = NULL; + int32_t cx_n = 0; + + while(true) { + pthread_mutex_lock(&rb->mut); + int k; + while((k = NUM_READY(*rp, *wp)) < UNPACK_BLK_SIZE && !args->input_finished) { + pthread_cond_wait(&rb->cond[1], &rb->mut); + } + if(!k) { + pthread_mutex_unlock(&rb->mut); + break; + } + if(k > UNPACK_BLK_SIZE) k = UNPACK_BLK_SIZE; + for(int i = 0; i < k; i++) { + bcf1_t *tmp = lbuf[i]; + lbuf[i] = rb->buf[*rp]; + idx[i] = rb->idx[*rp]; + rb->buf[*rp] = tmp; + *rp = ((*rp) + 1) % READ_BUF_SIZE; + } + pthread_mutex_unlock(&rb->mut); + pthread_cond_signal(&rb->cond[0]); + // Check if we have space to store in rec_buf + const uint64_t last_idx = idx[k - 1]; + pthread_mutex_lock(&rec_buf->mut); + while(last_idx - rec_buf->first_index >= REC_BUF_SIZE ) { + pthread_mutex_unlock(&rec_buf->mut); + nanosleep(&wt, NULL); + pthread_mutex_lock(&rec_buf->mut); + } + int ix = (int)(idx[0] - rec_buf->first_index); + for(int i = 0; i < k; i++) lrbuf[i] = rec_buf->buf[ix + i]; + pthread_mutex_unlock(&rec_buf->mut); + for(int i = 0; i < k; i++) { + process(lbuf[i], lrbuf[i], &cx, &cx_n, thr_idx > 0, args); + } + } + for(int i = 0; i < UNPACK_BLK_SIZE; i++) bcf_destroy(lbuf[i]); + if(!thr_idx) { + if(isatty(STDERR_FILENO)) fprintf(stderr,"Reading 100%% completed \n"); + else fprintf(stderr, "Input finished\n"); + } + return NULL; +} + +void *read_thread(void * const p) { + 
args_t * const args = p; + bcf_srs_t * const sr = args->sr; + bcf1_buffer_t * const rb = &args->read_buf; + volatile int * const rp = &rb->read_pos; + const struct timespec wt = {0, 5000}; + uint64_t count = 0; + for(int i = 0; i < READ_BUF_SIZE; i++) rb->buf[i] = bcf_init(); + while(!args->input_finished) { + int ix = rb->write_pos; + pthread_mutex_lock(&rb->mut); + while(BUF_SPACE(*rp, ix) < READ_BLK_SIZE) { + pthread_cond_wait(&rb->cond[0], &rb->mut); +// nanosleep(&wt, NULL); + } + int ix_end = (*rp); + pthread_mutex_unlock(&rb->mut); + int ix1 = (ix + 1) % READ_BUF_SIZE; + while(ix1 != ix_end) { + if(bcf_sr_next_line(sr)) { + rb->idx[ix] = count++; + bcf_sr_swap_line(sr, 0, rb->buf[ix]); + } else { + args->input_finished = true; + break; + } + rb->write_pos = ix = ix1; + ix1 = (ix + 1) % READ_BUF_SIZE; + } + pthread_cond_broadcast(&rb->cond[1]); + } + int ix = rb->write_pos; + pthread_mutex_lock(&rb->mut); + while((*rp) != ix) { + pthread_cond_broadcast(&rb->cond[1]); + pthread_cond_wait(&rb->cond[0], &rb->mut); +// pthread_mutex_unlock(&rb->mut); +// nanosleep(&wt, NULL); +// pthread_mutex_lock(&rb->mut); + } + pthread_mutex_unlock(&rb->mut); + for(int i = 0; i < READ_BUF_SIZE; i++) bcf_destroy(rb->buf[i]); + return NULL; +} + + From 99c591135235da2640afa6fde4642a0eb7f774b8 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Mon, 20 Jan 2020 10:53:17 +0100 Subject: [PATCH 27/61] Change gemBS to use the new mextr executable --- gemBS/__init__.py | 85 +++++--------------------------------- gemBS/parser.py | 2 +- gemBS/production.py | 36 ++++++++-------- gemBS/version.py | 4 +- setup.py | 12 ++---- tools/utils/command_line.c | 6 +-- tools/utils/mextr.c | 4 +- 7 files changed, 41 insertions(+), 108 deletions(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 1b4394bf..dfcaa8ae 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -88,11 +88,10 @@ def __getitem__(self, item): "gem-indexer": "gem-indexer", "gem-mapper": "gem-mapper", "bs_call": "bs_call", 
- "wigToBigWig": "wigToBigWig", - "bedToBigBed": "bedToBigBed", "dbSNP_idx": "dbSNP_idx", "gemBS_cat": "gemBS_cat", "md5_fasta": "md5_fasta", + "mextr": "mextr", "samtools": "samtools", "bcftools": "bcftools", "bgzip": "bgzip", @@ -1182,7 +1181,7 @@ def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=No def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=False,cpg=False,non_cpg=False,allow_het=False, inform=1,phred=20,min_nc=1,bedMethyl=False,bigWig=False,contig_list=None,contig_size_file=None, - snps=None,snp_list=None,snp_db=None,ref_bias=None): + snps=None,snp_list=None,snp_db=None,ref_bias=None,extract_threads=None): """ Filters bcf methylation calls file @@ -1203,9 +1202,9 @@ def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=Fal for ctg, size in contig_list: f.write("{}\t0\t{}\n".format(ctg, size)) - bcftools = [executables['bcftools'],'view','-R',contig_bed,'-Ou',bcfFile] - mextr_com = [executables['bcftools'],'+mextr','--','-z'] - mextr = [] + mextr = [executables['mextr'], '-z', '--md5', '-R', contig_bed] + if extract_threads: + mextr.extend(['-@', extract_threads]) if ref_bias: mextr.extend(['--reference-bias', ref_bias]) if cpg: @@ -1214,27 +1213,19 @@ def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=Fal mextr.extend(['--noncpgfile', outbase + '_non_cpg.txt', '--min-nc', str(min_nc)]) if bedMethyl: mextr.extend(['-b', outbase]) - if bigWig: - mextr.extend(['-w', '-']) - wig2bigwig = [executables['wigToBigWig'], '/dev/stdin', contig_size_file, outbase + '.bw'] if cpg or non_cpg: - mextr.extend(['--inform',str(inform),'--threshold',str(phred)]) + mextr.extend(['--inform',str(inform),'--threshold',str(phred),'--tabix']) if strand_specific: mextr.extend(['--mode', 'strand-specific']) if allow_het: mextr.extend(['--select', 'het']) - - if mextr: - mextr_com.extend(mextr) - pipeline = [bcftools, mextr_com] - if bigWig: - pipeline.append(wig2bigwig) - - 
logfile = os.path.join(output_dir,"mextr_{}.err".format(name)) - process = run_tools(pipeline, name="Methylation Extraction", logfile=logfile) + mextr.append(bcfFile); + logfile = os.path.join(output_dir,"mextr_{}.err".format(name)) + process = run_tools([mextr], name="Methylation Extraction", logfile=logfile) if snps: + bcftools = [executables['bcftools'],'view','-R',contig_bed,'-Ou',bcfFile] snpxtr = [executables['bcftools'],'+snpxtr','--','-z','-o',outbase + '_snps.txt.gz'] if snp_list: snpxtr.extend(['-s',snp_list]) @@ -1251,15 +1242,6 @@ def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=Fal os.remove(contig_bed) - # Now generate indexes and bigBed files if required - if cpg: - tfile = "{}_cpg.txt.gz.tbi".format(outbase) - if os.path.exists(tfile): - os.remove(tfile) - logfile = os.path.join(output_dir,"tabix_{}_cpg.err".format(name)) - tabix = [executables['tabix'], '-p', 'bed', '-S', '1', "{}_cpg.txt.gz".format(outbase)] - cpg_idx_proc = run_tools([tabix],name="Index Methylation CpG files", logfile=logfile) - if snps: tfile = "{}_snp.txt.gz.tbi".format(outbase) if os.path.exists(tfile): @@ -1268,53 +1250,6 @@ def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=Fal tabix = [executables['tabix'], '-S', '1', '-s' '1', '-b', '2', '-e', '2', "{}_snps.txt.gz".format(outbase)] snp_idx_proc = run_tools([tabix],name="Index SNP files", logfile=logfile) - if non_cpg: - tfile = "{}_non_cpg.txt.gz.tbi".format(outbase) - if os.path.exists(tfile): - os.remove(tfile) - logfile = os.path.join(output_dir,"tabix_{}_non_cpg.err".format(name)) - tabix = [executables['tabix'], '-p', 'bed', '-S', '1', "{}_non_cpg.txt.gz".format(outbase)] - non_cpg_idx_proc = run_tools([tabix],name="Index Methylation non-CpG files", logfile=logfile) - - if bedMethyl: - if pkg_resources.resource_exists("gemBS", "etc"): - etc_dir = pkg_resources.resource_filename("gemBS", "etc") - else: - raise CommandException("Couldn't locate gemBS etc directory") 
- - bm_proc = [] - bm_tfile = [] - for x in ('cpg', 'chg', 'chh'): - bfile = "{}_{}.bed.gz".format(outbase, x) - unzip = [executables['bgzip'], '-cd', bfile] - - sed = ['sed', '1d'] - ofile = "{}_{}.bed.tmp".format(outbase,x) - p = run_tools([unzip, sed],name='Make temp bed9+5 file',output=ofile) - bm_proc.append(p) - bm_tfile.append(ofile) - for p in bm_proc: - if p.wait() != 0: - raise ValueError("Error while uncompressing bed9+5 files.") - bm_proc = [] - for ix, x in enumerate(['cpg', 'chg', 'chh']): - bed2bb = [executables['bedToBigBed'], '-type=bed9+5',"-as={}/bed9_5.as".format(etc_dir), '-tab', bm_tfile[ix], - contig_size_file, "{}_{}.bb".format(outbase,x)] - logfile = os.path.join(output_dir,"bedToBigWig_{}_{}.err".format(name,x)) - p = run_tools([bed2bb], name='Make bigBed file',logfile=logfile) - bm_proc.append(p) - - if cpg: - if cpg_idx_proc.wait() != 0: - raise ValueError("Error while indexing CpG calls.") - if non_cpg: - if non_cpg_idx_proc.wait() != 0: - raise ValueError("Error while indexing non-CpG calls.") - if bedMethyl: - for ix, p in enumerate(bm_proc): - if p.wait() != 0: - raise ValueError("Error while making bigBed files.") - os.remove(bm_tfile[ix]) if snps: if snp_idx_proc.wait() != 0: raise ValueError("Error while indexing SNP calls.") diff --git a/gemBS/parser.py b/gemBS/parser.py index c9399dfa..55b9cdcc 100755 --- a/gemBS/parser.py +++ b/gemBS/parser.py @@ -142,7 +142,7 @@ def read(self, infile): 'keep_duplicates', 'keep_improper_pairs', 'call_threads', 'merge_threads', 'remove_individual_bcfs', 'haploid', 'reference_bias', 'conversion', 'contig_list', 'contig_pool_limit', 'benchmark_mode'), 'extract': ('extract_dir', 'jobs', 'allow_het', 'phred_threshold', 'min_inform', 'strand_specific', 'min_bc', 'make_cpg', 'make_non_cpg', - 'make_bedmethyl', 'make_bigwig', 'make_snps', 'snp_list', 'snp_db', 'reference_bias'), + 'make_bedmethyl', 'make_bigwig', 'make_snps', 'snp_list', 'snp_db', 'reference_bias', 'threads', 'extract_threads'), 
'report': ('project', 'report_dir', 'threads') } # Check if variables are used diff --git a/gemBS/production.py b/gemBS/production.py index ba4651f3..cb4aa308 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -1321,11 +1321,11 @@ class MethylationFiltering(BasicPipeline): A second set of extracted outputs that correspond to the ENCODE WGBS pipeline are also available using the --bed-methyl and --bigwig options. The --bed-methyl option will produce three files per sample for all covered sites in CpG, CHG and CHH context in BED9+5 format. Each of - the files will also be generated in bigBed format for display in genome browsers. The --bigwig option will produce a bigWig file giving - the methylation percentage at all covered cytosine sites (informative coverage > 0). For the ENCODE output files, not further filtering - is performed. + the files will also be generated in bigBed format for display in genome browsers. In addition a bigWig format file will be generated giving + the methylation percentage at all covered cytosine sites (informative coverage > 0). If the --strand-specific option is given then two bigWig + files will be geenrated - one for each strand. For the ENCODE output files, not further filtering is performed. - In addition to the methylation result, SNP genotypes can also be extract with the --snps options. By default, this will return a file + In addition to the methylation result, SNP genotypes can also be extracted with the --snps options. By default, this will return a file with genotypes on all SNPs covered by the experiment that were in the dbSNP_idx file used for the calling stage. This selection can be refined uwing the --snp-list option, which is a file with a list of SNP ids, one id per line. An alternate dbSNP_idx file can also be supplied using the --snp-db option, allowing SNPs that were not in the original dbSNP_idx file used for calling to be extracted. 
@@ -1344,7 +1344,7 @@ def register(self,parser): parser.add_argument('-j','--jobs', dest="jobs", type=int, help='Number of parallel jobs') parser.add_argument('-n','--sample-name',dest="sample_name",metavar="SAMPLE_NAME",help="Name of sample to be filtered") parser.add_argument('-b','--barcode',dest="sample",metavar="SAMPLE_BARCODE",help="Barcode of sample to be filtered") - parser.add_argument('-s','--strand-specific', dest="strand_specific", action="store_true", default=False, help="Output separate lines for each strand.") + parser.add_argument('-s','--strand-specific', dest="strand_specific", action="store_true", default=False, help="Output separate lines in CpG file and/or bigWig file for each strand.") parser.add_argument('-q','--phred-threshold', dest="phred", help="Min threshold for genotype phred score.") parser.add_argument('-I','--min-inform', dest="inform", help="Min threshold for informative reads.") parser.add_argument('-M','--min-nc', dest="min_nc", help="Min threshold for non-converted reads for non CpG sites.") @@ -1353,8 +1353,8 @@ def register(self,parser): parser.add_argument('-c','--cpg', dest="cpg", action="store_true", help="Output gemBS bed with cpg sites.") parser.add_argument('-N','--non-cpg', dest="non_cpg", action="store_true", help="Output gemBS bed with non-cpg sites.") parser.add_argument('-B','--bed-methyl', dest="bedMethyl", action="store_true", help="Output bedMethyl files (bed and bigBed)") - parser.add_argument('-w','--bigwig', dest="bigWig", action="store_true", help="Output bigWig file") parser.add_argument('-S','--snps', dest="snps", action="store_true",help="Output SNPs") + parser.add_argument('--extract-threads', dest="extract_threads", metavar="THREADS", help='Number of extra threads for extract step') parser.add_argument('--snp-list', dest="snp_list", help="List of SNPs to output") parser.add_argument('--snp-db', dest="snp_db", help="dbSNP_idx processed SNP idx") parser.add_argument('--dry-run', dest="dry_run", 
action="store_true", help="Output mapping commands without execution") @@ -1368,6 +1368,8 @@ def run(self,args): # JSON data self.jsonData = JSONdata(Mapping.gemBS_json) + self.threads = self.jsonData.check(section='extract',key='threads') + self.extract_threads = self.jsonData.check(section='extract',key='extract_threads',arg=args.extract_threads,default=self.threads) self.jobs = self.jsonData.check(section='extract',key='jobs',arg=args.jobs,default=1,int_type=True) self.allow_het = self.jsonData.check(section='extract',key='allow_het',arg=args.allow_het,boolean=True,default=False) self.cpg = self.jsonData.check(section='extract',key='make_cpg',arg=args.cpg,boolean=True,default=False) @@ -1377,7 +1379,7 @@ def run(self,args): self.non_cpg = self.jsonData.check(section='extract',key='make_non_cpg',arg=args.non_cpg,boolean=True,default=False) self.bedMethyl = self.jsonData.check(section='extract',key='make_bedmethyl',arg=args.bedMethyl,boolean=True,default=False) self.ref_bias = self.jsonData.check(section='extract',key='reference_bias',arg=args.ref_bias) - self.bigWig = self.jsonData.check(section='extract',key='make_bigwig',arg=args.bigWig,boolean=True,default=False) +# self.bigWig = self.jsonData.check(section='extract',key='make_bigwig',arg=args.bigWig,boolean=True,default=False) self.strand_specific = self.jsonData.check(section='extract',key='strand_specific',arg=args.strand_specific,boolean=True,default=False) self.phred = self.jsonData.check(section='extract',key='phred_threshold',arg=args.phred, default = '20') self.inform = self.jsonData.check(section='extract',key='min_inform',arg=args.inform, default = 1, int_type=True) @@ -1406,7 +1408,7 @@ def run(self,args): if self.cpg: self.mask |= 3 if self.non_cpg: self.mask |= 12 if self.bedMethyl: self.mask |= 48 - if self.bigWig: self.mask |= 192 +# if self.bigWig: self.mask |= 192 if self.snps: self.mask |= 768 self.mask1 = self.mask & 341 @@ -1506,18 +1508,17 @@ def do_filter(self, v): cpg, non_cpg, bigWig, 
bedMethyl, snps = (False, False, False, False, False) if self.cpg and not (sm & 3): cpg = True - files.extend([filebase + '_cpg.txt.gz', filebase + '_cpg.txt.gz.tbi']) + files.extend([filebase + '_cpg.txt.gz', filebase + '_cpg.txt.gz.tbi', filebase + '_cpg.txt.gz.md5']) if self.non_cpg and not (sm & 12): non_cpg = True - files.extend([filebase + '_non_cpg.txt.gz', filebase + '_non_cpg.txt.gz.tbi']) - if self.bigWig and not (sm & 48): - bigWig = True - files.append(filebase + '.bw') + files.extend([filebase + '_non_cpg.txt.gz', filebase + '_non_cpg.txt.gz.tbi', filebase + '_non_cpg.txt.gz.md5']) if self.bedMethyl and not (sm & 192): bedMethyl = True for x in ('cpg', 'chg', 'chh') : - files.extend([filebase + "_{}.bed.gz".format(x), filebase + "_{}.bed.tmp".format(x), - filebase + "_{}.bb".format(x)]) + files.extend([filebase + "_{}.bed.gz".format(x), filebase + "_{}.bed.gz.md5".format(x), + filebase + "_{}.bb".format(x), filebase + "_{}.bb.md5".format(x)]) + files.extend([filebase + '.bw', filebase + '.bw.md5']) + if self.snps and not(sm & 768): snps = True files.extend([filebase + '_snps.txt.gz', filebase + '_snps.txt.gz_tbi']) @@ -1538,9 +1539,10 @@ def do_filter(self, v): if args.min_nc: com.extend(['-M', args.min_nc]) if args.ref_bias: com.extend(['--reference-bias', args.ref_bias]) if args.allow_het: com.extend(['-H', args.allow_het]) + if args.extract_threads: com.extend(['-@', args.extract_threads]) if cpg: com.append('--cpg') if non_cpg: com.append('--non-cpg') - if bigWig: com.append('--bigwig') +# if bigWig: com.append('--bigwig') if bedMethyl: com.append('--bed-methyl') if snps: com.append('--snps') @@ -1569,7 +1571,7 @@ def do_filter(self, v): cpg=cpg,non_cpg=non_cpg,contig_list=self.contig_list,allow_het=self.allow_het, inform=self.inform,phred=self.phred,min_nc=self.min_nc,bedMethyl=bedMethyl, bigWig=bigWig,contig_size_file=self.contig_size_file,ref_bias=self.ref_bias, - snps=snps,snp_list=self.snp_list,snp_db=self.snp_db) + 
snps=snps,snp_list=self.snp_list,snp_db=self.snp_db,extract_threads=self.extract_threads) if ret: logging.gemBS.gt("Results extraction for {} done, results located in: {}".format(bcf_file, ret)) diff --git a/gemBS/version.py b/gemBS/version.py index 306c9ba5..1fbbc838 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" -__VERSION_MINOR = "4" -__VERSION_SUBMINOR = "4" +__VERSION_MINOR = "5" +__VERSION_SUBMINOR = "0" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) diff --git a/setup.py b/setup.py index d48e0c4b..f1f65e02 100644 --- a/setup.py +++ b/setup.py @@ -64,9 +64,7 @@ def _install_bundle(install_dir, inst): os.mkdir(gemBSbin_dir) # copy tools/bin - bins = ['gemBS_cat', 'readNameClean', 'md5_fasta'] - if not (inst.minimal or inst.no_kent): - bins.extend(['wigToBigWig', 'bedToBigBed']) + bins = ['gemBS_cat', 'readNameClean', 'md5_fasta', 'mextr'] for file in bins: f = os.path.join('tools/bin', file) if os.path.exists(f): @@ -154,20 +152,18 @@ def _install_bundle(install_dir, inst): class install(_install): _install.user_options.extend([ ('no-samtools', None, "Do not install samtools"), - ('no-kent', None, "Do not install kent tools (bedToBigBed, wigToBigWig)"), ('no-gem3', None, "Do not install gem3 mapper"), ('no-bscall', None, "Do not install bscall"), ('minimal', None, - "Perform minimal install (equivalent to --no-samtools --no-kent --no-gem3 --no-bscall)"), + "Perform minimal install (equivalent to --no-samtools --no-gem3 --no-bscall)"), ('disable-cuda', None, "Do not build GPU support for GEM3 (default)"), ('enable-cuda', None, "Try to build GPU support for GEM3"), ]) - _install.boolean_options.extend(['no-samtools','no-kent','no-gem3','no-bscall','minimal']) + _install.boolean_options.extend(['no-samtools','no-gem3','no-bscall','minimal']) def initialize_options(self): self.minimal = False self.no_samtools = False - self.no_kent = False self.no_gem3 = False self.no_bscall = False 
self.disable_cuda = False @@ -179,8 +175,6 @@ def run(self): if not self.minimal: if not self.no_samtools: options.append('_samtools') - if not self.no_kent: - options.append('kent') if not self.no_gem3: options.append('gem3') if not self.no_bscall: diff --git a/tools/utils/command_line.c b/tools/utils/command_line.c index 3e42bc1e..d19f3abd 100644 --- a/tools/utils/command_line.c +++ b/tools/utils/command_line.c @@ -45,9 +45,9 @@ const char *usage(void) { " -b, --bed-methyl Output file base for bedMethly files. Not compatible with multi-sample files (default, not output)\n" " -t, --bed-track-line Track line for for bedMethly files (default, info taken from input VCF file)\n" " -S, --report-file Output file for JSON report (default, not output)\n" - " -r, --regions restrict to comma separated list of regions" - " -R, --regions-file restrict to regions listed in file" - " -@, --threads Extra threads" + " -r, --regions restrict to comma separated list of regions\n" + " -R, --regions-file restrict to regions listed in file\n" + " -@, --threads Extra threads\n" " -H, --no_header Do not print header line(s) in output file(s) (default, false)\n" " -g, --common-gt Recall genotypes assuming common genotypes across samples\n" " -m, --mode Output mode for CpG sites\n" diff --git a/tools/utils/mextr.c b/tools/utils/mextr.c index 3173a99d..a179392d 100644 --- a/tools/utils/mextr.c +++ b/tools/utils/mextr.c @@ -68,9 +68,11 @@ int main(int argc, char **argv) { fill_base_prob_table(); pthread_t read_thr; pthread_create(&read_thr, NULL, read_thread, &args); + // If we are generating bigBed and bigWig files, we will take half of the threads for the compression stage + if(args.bedmethyl && args.threads > 1) args.threads = (args.threads + 1) >> 1; int nt = 1; // If the user has asked for more threads we will take one extra thread for the bcf unpacking - more than this is rarely useful - if(args.threads > 1) { + if(args.threads > 4) { nt++; args.threads--; } From 
f564fe0cac18847f8d5c79f646c5c140197e9a89 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Mon, 20 Jan 2020 11:29:01 +0100 Subject: [PATCH 28/61] Add hooks for bigWig strand specific mode --- gemBS/__init__.py | 4 +++- gemBS/etc/gemBS_configs/IHEC_standard.conf | 2 +- gemBS/parser.py | 2 +- gemBS/production.py | 17 ++++++++++++----- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index dfcaa8ae..14aa1bec 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -1179,7 +1179,7 @@ def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=No return " ".join(list(sample_bam.keys())) -def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=False,cpg=False,non_cpg=False,allow_het=False, +def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=False,bw_strand_specific=False,cpg=False,non_cpg=False,allow_het=False, inform=1,phred=20,min_nc=1,bedMethyl=False,bigWig=False,contig_list=None,contig_size_file=None, snps=None,snp_list=None,snp_db=None,ref_bias=None,extract_threads=None): @@ -1218,6 +1218,8 @@ def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=Fal mextr.extend(['--inform',str(inform),'--threshold',str(phred),'--tabix']) if strand_specific: mextr.extend(['--mode', 'strand-specific']) + if bw_strand_specific: + mextr.extend(['--bw-mode', 'strand-specific']) if allow_het: mextr.extend(['--select', 'het']) mextr.append(bcfFile); diff --git a/gemBS/etc/gemBS_configs/IHEC_standard.conf b/gemBS/etc/gemBS_configs/IHEC_standard.conf index baa976ab..e98fc323 100644 --- a/gemBS/etc/gemBS_configs/IHEC_standard.conf +++ b/gemBS/etc/gemBS_configs/IHEC_standard.conf @@ -26,8 +26,8 @@ contig_pool_limit = 25000000 [extract] strand_specific = True +bigWig_strand_specific = True phred_threshold = 10 make_cpg = True make_non_cpg = True make_bedmethyl = True -make_bigwig = True diff --git a/gemBS/parser.py b/gemBS/parser.py index 
55b9cdcc..ae9c0301 100755 --- a/gemBS/parser.py +++ b/gemBS/parser.py @@ -142,7 +142,7 @@ def read(self, infile): 'keep_duplicates', 'keep_improper_pairs', 'call_threads', 'merge_threads', 'remove_individual_bcfs', 'haploid', 'reference_bias', 'conversion', 'contig_list', 'contig_pool_limit', 'benchmark_mode'), 'extract': ('extract_dir', 'jobs', 'allow_het', 'phred_threshold', 'min_inform', 'strand_specific', 'min_bc', 'make_cpg', 'make_non_cpg', - 'make_bedmethyl', 'make_bigwig', 'make_snps', 'snp_list', 'snp_db', 'reference_bias', 'threads', 'extract_threads'), + 'make_bedmethyl', 'bigwig_strand_specific', 'make_bigwig', 'make_snps', 'snp_list', 'snp_db', 'reference_bias', 'threads', 'extract_threads'), 'report': ('project', 'report_dir', 'threads') } # Check if variables are used diff --git a/gemBS/production.py b/gemBS/production.py index cb4aa308..cbce5e62 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -1344,7 +1344,8 @@ def register(self,parser): parser.add_argument('-j','--jobs', dest="jobs", type=int, help='Number of parallel jobs') parser.add_argument('-n','--sample-name',dest="sample_name",metavar="SAMPLE_NAME",help="Name of sample to be filtered") parser.add_argument('-b','--barcode',dest="sample",metavar="SAMPLE_BARCODE",help="Barcode of sample to be filtered") - parser.add_argument('-s','--strand-specific', dest="strand_specific", action="store_true", default=False, help="Output separate lines in CpG file and/or bigWig file for each strand.") + parser.add_argument('-s','--strand-specific', dest="strand_specific", action="store_true", default=False, help="Output separate lines in CpG file for each strand.") + parser.add_argument('-W','--bigwig-strand-specific', dest="bw_strand_specific", action="store_true", default=False, help="Output separate bigWig files for each strand.") parser.add_argument('-q','--phred-threshold', dest="phred", help="Min threshold for genotype phred score.") parser.add_argument('-I','--min-inform', dest="inform", 
help="Min threshold for informative reads.") parser.add_argument('-M','--min-nc', dest="min_nc", help="Min threshold for non-converted reads for non CpG sites.") @@ -1381,6 +1382,7 @@ def run(self,args): self.ref_bias = self.jsonData.check(section='extract',key='reference_bias',arg=args.ref_bias) # self.bigWig = self.jsonData.check(section='extract',key='make_bigwig',arg=args.bigWig,boolean=True,default=False) self.strand_specific = self.jsonData.check(section='extract',key='strand_specific',arg=args.strand_specific,boolean=True,default=False) + self.bw_strand_specific = self.jsonData.check(section='extract',key='bigwig_strand_specific',arg=args.bw_strand_specific,boolean=True,default=False) self.phred = self.jsonData.check(section='extract',key='phred_threshold',arg=args.phred, default = '20') self.inform = self.jsonData.check(section='extract',key='min_inform',arg=args.inform, default = 1, int_type=True) self.min_nc = self.jsonData.check(section='extract',key='min_nc',arg=args.inform, default = 1, int_type=True) @@ -1514,10 +1516,14 @@ def do_filter(self, v): files.extend([filebase + '_non_cpg.txt.gz', filebase + '_non_cpg.txt.gz.tbi', filebase + '_non_cpg.txt.gz.md5']) if self.bedMethyl and not (sm & 192): bedMethyl = True - for x in ('cpg', 'chg', 'chh') : + for x in ('cpg', 'chg', 'chh'): files.extend([filebase + "_{}.bed.gz".format(x), filebase + "_{}.bed.gz.md5".format(x), filebase + "_{}.bb".format(x), filebase + "_{}.bb.md5".format(x)]) - files.extend([filebase + '.bw', filebase + '.bw.md5']) + if self.bw_strand_specific: + for x in ('pos', 'neg'): + files.extend([filebase + '_{}.bw'.format(x), filebase + '_{}.bw.md5'.format(x)]) + else: + files.extend([filebase + '.bw', filebase + '.bw.md5']) if self.snps and not(sm & 768): snps = True @@ -1533,7 +1539,8 @@ def do_filter(self, v): if Mapping.gemBS_json != '.gemBS/gemBS.json': com.extend(['-j',Mapping.gemBS_json]) com.extend(['extract','-b',sample]) - if args.strand_specific: com.append('-x') + if 
args.strand_specific: com.append('-s') + if args.bw_strand_specific: com.append('-W') if args.phred: com.extend(['-q', args.phred]) if args.inform: com.extend(['-I', args.inform]) if args.min_nc: com.extend(['-M', args.min_nc]) @@ -1567,7 +1574,7 @@ def do_filter(self, v): database.reg_db_com(filebase, "UPDATE extract SET status = 0 WHERE filepath = '{}'".format(filebase), files) #Call methylation extract - ret = methylationFiltering(bcfFile=bcf_file,outbase=filebase,name=sample,strand_specific=self.strand_specific, + ret = methylationFiltering(bcfFile=bcf_file,outbase=filebase,name=sample,strand_specific=self.strand_specific,bw_strand_specific=self.bw_strand_specific, cpg=cpg,non_cpg=non_cpg,contig_list=self.contig_list,allow_het=self.allow_het, inform=self.inform,phred=self.phred,min_nc=self.min_nc,bedMethyl=bedMethyl, bigWig=bigWig,contig_size_file=self.contig_size_file,ref_bias=self.ref_bias, From b33c43fc40c110b622cfac22f55832ccecf90337 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Fri, 24 Jan 2020 20:55:44 +0100 Subject: [PATCH 29/61] rewrite snpxtr as standalone tool rather than a bcftools plugin --- tools/utils/Makefile.in | 45 ++- tools/utils/common/dbSNP.c | 349 +++++++++++++++++++++++ tools/utils/common/dbSNP.h | 45 +++ tools/utils/common/uthash.h | 1 + tools/utils/{ => common}/utils.c | 0 tools/utils/{ => common}/utils.h | 4 +- tools/utils/{ => mextr}/bbi.c | 0 tools/utils/{ => mextr}/bbi.h | 0 tools/utils/{ => mextr}/bbi_defs.h | 0 tools/utils/{ => mextr}/bbi_structs.h | 0 tools/utils/{ => mextr}/calc_gt_prob.c | 0 tools/utils/{ => mextr}/command_line.c | 2 +- tools/utils/{ => mextr}/files.c | 0 tools/utils/{ => mextr}/init_params.c | 0 tools/utils/{ => mextr}/mextr.c | 0 tools/utils/{ => mextr}/mextr.h | 0 tools/utils/{ => mextr}/output.c | 0 tools/utils/{ => mextr}/output_headers.c | 0 tools/utils/{ => mextr}/output_utils.c | 0 tools/utils/{ => mextr}/rec.c | 0 tools/utils/{ => mextr}/stats.c | 0 tools/utils/{ => mextr}/unpack.c | 0 
tools/utils/readNameClean.c | 139 --------- tools/utils/snpxtr/command_line.c | 162 +++++++++++ tools/utils/snpxtr/files.c | 111 +++++++ tools/utils/snpxtr/init_params.c | 25 ++ tools/utils/snpxtr/process.c | 197 +++++++++++++ tools/utils/snpxtr/snplist.c | 63 ++++ tools/utils/snpxtr/snpxtr.c | 44 +++ tools/utils/snpxtr/snpxtr.h | 54 ++++ tools/utils/uthash.h | 1 - 31 files changed, 1086 insertions(+), 156 deletions(-) create mode 100644 tools/utils/common/dbSNP.c create mode 100644 tools/utils/common/dbSNP.h create mode 120000 tools/utils/common/uthash.h rename tools/utils/{ => common}/utils.c (100%) rename tools/utils/{ => common}/utils.h (99%) rename tools/utils/{ => mextr}/bbi.c (100%) rename tools/utils/{ => mextr}/bbi.h (100%) rename tools/utils/{ => mextr}/bbi_defs.h (100%) rename tools/utils/{ => mextr}/bbi_structs.h (100%) rename tools/utils/{ => mextr}/calc_gt_prob.c (100%) rename tools/utils/{ => mextr}/command_line.c (99%) rename tools/utils/{ => mextr}/files.c (100%) rename tools/utils/{ => mextr}/init_params.c (100%) rename tools/utils/{ => mextr}/mextr.c (100%) rename tools/utils/{ => mextr}/mextr.h (100%) rename tools/utils/{ => mextr}/output.c (100%) rename tools/utils/{ => mextr}/output_headers.c (100%) rename tools/utils/{ => mextr}/output_utils.c (100%) rename tools/utils/{ => mextr}/rec.c (100%) rename tools/utils/{ => mextr}/stats.c (100%) rename tools/utils/{ => mextr}/unpack.c (100%) delete mode 100644 tools/utils/readNameClean.c create mode 100644 tools/utils/snpxtr/command_line.c create mode 100644 tools/utils/snpxtr/files.c create mode 100644 tools/utils/snpxtr/init_params.c create mode 100644 tools/utils/snpxtr/process.c create mode 100644 tools/utils/snpxtr/snplist.c create mode 100644 tools/utils/snpxtr/snpxtr.c create mode 100644 tools/utils/snpxtr/snpxtr.h delete mode 120000 tools/utils/uthash.h diff --git a/tools/utils/Makefile.in b/tools/utils/Makefile.in index 349ab8a9..425c86b2 100644 --- a/tools/utils/Makefile.in +++ 
b/tools/utils/Makefile.in @@ -13,16 +13,35 @@ CC=gcc ROOT_PATH=.. -TOOLS=gemBS_cat readNameClean md5_fasta mextr +TOOLS=gemBS_cat readNameClean md5_fasta mextr snpxtr FOLDER_BIN=../bin TOOLS_BIN=$(addprefix $(FOLDER_BIN)/, $(TOOLS)) LIBS:= -lm +GENERAL_FLAGS = -I. -Icommon -MEXTR_INC = @HTSINC@ +MEXTR_INC = @HTSINC@ -Imextr MEXTR_LIBS = @HTSLIBS@ -lz -lbz2 -lpthread $(LIBS) -MEXTR_SRC=mextr.c calc_gt_prob.c output.c output_utils.c output_headers.c command_line.c init_params.c files.c \ +MEXTR=mextr.c calc_gt_prob.c output.c output_utils.c output_headers.c command_line.c init_params.c files.c \ stats.c unpack.c rec.c bbi.c +MEXTR_COMMON=utils.c +MEXTR_DEPS=mextr.h bbi.h bbi_defs.h bbi_structs.h +MEXTR_COMMON_DEPS=utils.h + +MEXTR_SRC=$(addprefix mextr/, $(MEXTR)) $(addprefix common/, $(MEXTR_COMMON)) +MEXTR_DEP=$(addprefix mextr/, $(MEXTR_DEPS)) $(addprefix common/, $(MEXTR_COMMON_DEPS)) + +SNPXTR_INC = @HTSINC@ -Isnpxtr +SNPXTR_LIBS = @HTSLIBS@ -lz -lbz2 -lpthread $(LIBS) + +SNPXTR=snpxtr.c init_params.c command_line.c snplist.c process.c files.c +SNPXTR_COMMON=utils.c dbSNP.c + +SNPXTR_DEPS=snpxtr.h +SNPXTR_COMMON_DEPS=utils.h dbSNP.h + +SNPXTR_SRC=$(addprefix snpxtr/, $(SNPXTR)) $(addprefix common/, $(SNPXTR_COMMON)) +SNPXTR_DEP=$(addprefix snpxtr/, $(SNPXTR_DEPS)) $(addprefix common/, $(SNPXTR_COMMON_DEPS)) all: TOOLS_FLAGS=-O3 -g $(GENERAL_FLAGS) $(ARCH_FLAGS) $(SUPPRESS_CHECKS) $(OPTIMIZTION_FLAGS) $(ARCH_FLAGS_OPTIMIZTION_FLAGS) all: $(TOOLS_BIN) @@ -39,18 +58,18 @@ clean: distclean: clean rm -f Makefile config.status -utils.o: utils.c utils.h - $(CC) $(TOOLS_FLAGS) -c utils.c +$(FOLDER_BIN)/gemBS_cat: gemBS_cat.c common/utils.c + $(CC) $(TOOLS_FLAGS) -o $@ gemBS_cat.c common/utils.c $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS) -$(FOLDER_BIN)/gemBS_cat: gemBS_cat.c utils.o - $(CC) $(TOOLS_FLAGS) -o $@ gemBS_cat.c utils.o $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS) +$(FOLDER_BIN)/md5_fasta: md5_fasta.c common/utils.c + $(CC) 
$(TOOLS_FLAGS) -o $@ md5_fasta.c common/utils.c $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS)-lcrypto -$(FOLDER_BIN)/md5_fasta: md5_fasta.c utils.o - $(CC) $(TOOLS_FLAGS) -o $@ md5_fasta.c utils.o $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS)-lcrypto +$(FOLDER_BIN)/readNameClean: readNameClean/readNameClean.c common/utils.c + $(CC) $(TOOLS_FLAGS) -o $@ readNameClean/readNameClean.c common/utils.c $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS) -$(FOLDER_BIN)/readNameClean: readNameClean.c utils.o - $(CC) $(TOOLS_FLAGS) -o $@ readNameClean.c utils.o $(LIB_PATH_FLAGS) $(INCLUDE_FLAGS) $(LIBS) $(EXTRA_LIBS) +$(FOLDER_BIN)/mextr: $(MEXTR_SRC) $(MEXTR_DEP) + $(CC) $(TOOLS_FLAGS) -o $@ $(MEXTR_SRC) $(MEXTR_INC) $(MEXTR_LIBS) -$(FOLDER_BIN)/mextr: $(MEXTR_SRC) mextr.h bbi.h bbi_defs.h bbi_structs.h utils.o - $(CC) $(TOOLS_FLAGS) -o $@ $(MEXTR_SRC) utils.o $(MEXTR_INC) $(MEXTR_LIBS) +$(FOLDER_BIN)/snpxtr: $(SNPXTR_SRC) $(SNPXTR_DEP) + $(CC) $(TOOLS_FLAGS) -o $@ $(SNPXTR_SRC) $(SNPXTR_INC) $(SNPXTR_LIBS) diff --git a/tools/utils/common/dbSNP.c b/tools/utils/common/dbSNP.c new file mode 100644 index 00000000..7557fd36 --- /dev/null +++ b/tools/utils/common/dbSNP.c @@ -0,0 +1,349 @@ +/* + * load_dbSNP.c + * + * Created on: Nov 24, 2019 + * Author: heath + */ + +#include +#include +#include +#include +#include + +#include "uthash.h" +#include "dbSNP.h" + +static void store_dbsnp_entries(dbsnp_bin_t *bin, int n_entries, int name_buf_sz, uint16_t *entries, uint8_t *name_buf) { + bin->entries = malloc(sizeof(uint16_t) * n_entries); + bin->name_buf = malloc((size_t)name_buf_sz); + bin->n_entries = n_entries; + uint64_t msk = (uint64_t)0; + for(int i = 0; i < n_entries; i++) { + bin->entries[i] = entries[i]; + msk |= ((uint64_t)1 << (entries[i] & 63)); + } + bin->mask = msk; + memcpy(bin->name_buf, name_buf, name_buf_sz); +} + +dbsnp_header_t *load_dbSNP_header(char * const fname) { + bool ok = true; + dbsnp_header_t *hdr = NULL; + FILE * const file = 
fopen(fname, "rb"); + if(!file) { + fprintf(stderr, "Could not open file %s for input: %s\n", fname, strerror(errno)); + return NULL; + } + fprintf(stderr,"Loading dbSNP header from %s\n", fname); + uint32_t td[2]; + size_t sz = fread(td, sizeof(uint32_t), 2, file); + if(sz != 2 || td[0] != 0xd7278434) { + fprintf(stderr, "Invalid format\n"); + return NULL; + } + void *ucomp_buf = NULL, *comp_buf = NULL; + uint32_t n_ctgs = 0; + uint64_t td1[3]; + dbsnp_ctg_t * ctgs = NULL; + sz = fread(td1, sizeof(uint64_t), 3, file); + if(sz != 3) ok = false; + else { + hdr = calloc(1, sizeof(dbsnp_header_t)); + hdr->fp = file; + hdr->filename = fname; + hdr->dbSNP_bufsize = td1[1]; + ucomp_buf = malloc(td1[1]); + comp_buf = malloc(td1[2]); + sz = fread(comp_buf, 1, td1[2], file); + unsigned long size = td1[1]; + if(sz != td1[2]) ok = false; + else { + int ret = uncompress(ucomp_buf, &size, comp_buf, td1[2]); + if(ret) ok = false; + } + if(ok) { + hdr->n_dbSNP_prefixes = *((uint16_t *)(ucomp_buf + 2)); + n_ctgs = *((uint32_t *)(ucomp_buf + 4)); + ctgs = calloc(n_ctgs, sizeof(dbsnp_ctg_t)); + char *p = ucomp_buf + 8; + char *p1 = ucomp_buf + size; + size_t l = strlen(p); + if(p + 8 >= p1 || strncmp(p, "track ", 6)) ok = false; + else { + hdr->dbSNP_header = malloc(l - 5); + memcpy(hdr->dbSNP_header, p + 6, l - 5); + hdr->dbSNP_prefix = malloc(sizeof(void *) * hdr->n_dbSNP_prefixes); + p += l + 1; + } + for(int i = 0; ok && i < hdr->n_dbSNP_prefixes && p < p1; i++) { + l = strlen(p); + if(p + l >= p1) ok = false; + else { + hdr->dbSNP_prefix[i] = malloc(l + 1); + memcpy(hdr->dbSNP_prefix[i], p, l + 1); + p += l + 1; + } + } + uint32_t min_bin = 0, max_bin = 0; + for(int i = 0; ok && i < n_ctgs && p < p1; i++) { + if(p + 8 >= p1) ok = false; + else { + memcpy(&min_bin, p, sizeof(uint32_t)); + memcpy(&max_bin, p + 4, sizeof(uint32_t)); + if(max_bin < min_bin) { + ok = false; + } + else p += 8; + } + if(!ok) break; + l = strlen(p); + if(p + l >= p1) ok = false; + else { + 
ctgs[i].min_bin = min_bin; + ctgs[i].max_bin = max_bin; + ctgs[i].name = malloc(l + 1); + memcpy(ctgs[i].name, p, l + 1); + p += l + 1; + } + } + } + } + if(!ok) { + free(hdr); + fclose(file); + return NULL; + } + fseek(file, td1[0], SEEK_SET); + for(int i = 0; i < n_ctgs; i++) { + size_t k = fread(&ctgs[i].file_offset, sizeof(uint64_t), 1, file); + if(k != 1) { + ok = false; + break; + } + dbsnp_ctg_t *ctg; + HASH_FIND(hh, hdr->dbSNP, ctgs[i].name, strlen(ctgs[i].name), ctg); + if(ctg != NULL) { + fprintf(stderr,"Error in dbSNP file - duplicate contigs (%s)\n", ctgs[i].name); + ok = false; + break; + } + HASH_ADD_KEYPTR(hh, hdr->dbSNP, ctgs[i].name, strlen(ctgs[i].name), ctgs + i); + } + if(comp_buf) free(comp_buf); + if(ucomp_buf) free(ucomp_buf); + if(ok) fprintf(stderr, "dbSNP index loaded OK\n"); + else { + fprintf(stderr, "dbSNP index loading failed\n"); + free(hdr); + fclose(file); + hdr = NULL; + } + return hdr; +} + +void unload_dbSNP_ctg(dbsnp_ctg_t * const ctg) { + if(ctg && ctg->bins) { + dbsnp_bin_t *bin = ctg->bins; + for(uint32_t bn = ctg->min_bin; bn <= ctg->max_bin; bn++, bin++) { + if(bin->n_entries) { + free(bin->entries); + free(bin->name_buf); + } + } + free(ctg->bins); + ctg->bins = NULL; + } +} + +bool load_dbSNP_ctg(const dbsnp_header_t * const hdr, dbsnp_ctg_t * const ctg) { + bool ok = true; + + static uint8_t db_tab[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, + 0x47, 0x48, 0x49, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x60, 0x61, 0x62, + 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 
0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, + 0x79, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x90, 0x91, 0x92, 0x93, 0x94, + 0x95, 0x96, 0x97, 0x98, 0x99, 0x0f, 0x1f, 0x2f, 0x3f, 0x4f, 0x5f, 0x6f, 0x7f, 0x8f, 0x9f, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }; + + fprintf(stderr,"Loading dbSNP entries for %s\n", ctg->name); + FILE * const file = hdr->fp; + + void * const ucomp_buf = malloc(hdr->dbSNP_bufsize); + size_t comp_buf_size = 1 + hdr->dbSNP_bufsize * .75; + void *comp_buf = malloc(comp_buf_size); + fseek(file, ctg->file_offset, SEEK_SET); + uint16_t *entries = malloc(sizeof(uint16_t) * 64); + uint8_t *name_buf = malloc(sizeof(uint8_t) * 256 * 64); + int n_snps = 0, n_bins = 0; + ctg->bins = calloc((ctg->max_bin - ctg->min_bin + 1), sizeof(dbsnp_bin_t)); + dbsnp_bin_t *bins = ctg->bins; + uint32_t curr_bin = ctg->min_bin; + while(ok) { + uint64_t sz; + size_t k = fread(&sz, sizeof(uint64_t), 1, file); + unsigned long size; + if(k != 1) ok = false; + else if(sz == 0) break; + else { + if(comp_buf_size < sz) { + comp_buf_size = sz * 1.1; + comp_buf = realloc(comp_buf, comp_buf_size); + } + k = fread(comp_buf, 1, sz, file); + if(k != sz) ok = false; + else { + size = hdr->dbSNP_bufsize; + int ret = uncompress(ucomp_buf, &size, comp_buf, sz); + if(ret) ok = false; + } + } + if(!ok) break; 
+ uint8_t *bp = ucomp_buf; + uint8_t *bp_end = ucomp_buf + size; + int n_entries = 0, name_buf_ptr = 0; + bool end_of_bin = false; + int prev_ix = -1; + while(ok && bp < bp_end) { + if(!n_entries) { + uint32_t bin_inc = 0; + uint8_t x = *bp++; + switch(x & 3) { + case 0: + bin_inc = x >> 2; + break; + case 1: + if(bp < bp_end) { + x = *bp++; + bin_inc = x; + } else ok = false; + break; + case 2: + if(bp + 1 < bp_end) { + uint16_t k; + memcpy(&k, bp, 2); + bp += 2; + bin_inc = k; + + } else ok = false; + break; + case 3: + if(bp + 3 < bp_end) { + uint32_t k; + memcpy(&k, bp, 4); + bp += 4; + bin_inc = k; + } else ok = false; + break; + } + if(!ok) break; + curr_bin += bin_inc; + if(curr_bin > ctg->max_bin || bp >= bp_end) { + ok = false; + break; + } + bins += bin_inc; + } + uint8_t x = *bp++; + int prefix_ix = x >> 6; + int sl; + if(!prefix_ix) { + if(bp + 2 < bp_end) { + name_buf[name_buf_ptr++] = *bp++; + name_buf[name_buf_ptr++] = *bp++; + sl = 2; + } else ok = false; + } else sl = 0; + if(!ok) break; + if((x & 63) <= prev_ix || prefix_ix > hdr->n_dbSNP_prefixes) ok = false; + else { + prev_ix = x & 63; + int k = name_buf_ptr; + while((*bp) > 1 && sl++ < 256 && bp < bp_end) { + name_buf[name_buf_ptr++] = db_tab[(int)(*bp++)]; + } + k = name_buf_ptr - k; + if(*bp > 1) ok = false; + else { + if(*bp++ == 1) end_of_bin = true; + if(n_entries == 64) ok = false; + else entries[n_entries++] = (k << 8) | (uint16_t)x; + } + } + if(!ok) break; + if(end_of_bin) { + store_dbsnp_entries(bins, n_entries, name_buf_ptr, entries, name_buf); + n_bins++; + n_snps += n_entries; + n_entries = 0; + name_buf_ptr = 0; + prev_ix = -1; + end_of_bin = false; + } + } + } + fprintf(stderr, "ctg loaded: %s, n_snps = %d\n", ok ? 
"OK" : "BAD", n_snps); + free(ucomp_buf); + free(comp_buf); + free(entries); + free(name_buf); + return ok; +} + +bool dbSNP_lookup_name(const dbsnp_header_t *const hdr, const dbsnp_ctg_t * ctg, char * const rs, size_t * const rs_len, const uint32_t x) { + static char dtab[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0, 0, 0, 0, 0, 0 }; + bool found = false; + rs[0] = 0; + if(ctg != NULL) { + int bn = x >> 6; + if(bn >= ctg->min_bin && bn <= ctg->max_bin) { + dbsnp_bin_t *b = ctg->bins + bn - ctg->min_bin; + int ix = x & 63; + uint64_t mk = (uint64_t)1 << ix; + if(b->mask & mk) { + uint64_t mk1 = b->mask & (mk - (uint64_t)1); + int i = 0, j = 0; + while(mk1) { + if(mk1 & (uint64_t)1) { + uint16_t en = b->entries[i++]; + j += en >> 8; + if(!((en >> 6) & 3)) j += 2; + } + mk1 >>= 1; + } + char *tp = rs; + int prefix_id = (b->entries[i] >> 6) & 3; + unsigned char *tp1 = b->name_buf + j; + if((prefix_id--) == 0) { + prefix_id = (tp1[0] << 8) | tp1[1]; + tp1+=2; + } + char * db_prefix = hdr->dbSNP_prefix[prefix_id]; + j = 0; + while(db_prefix[j]) *tp++ = db_prefix[j++]; + j = b->entries[i] >> 8; + for(int k = 0; k < j; k++) { + unsigned char z = *tp1++; + *tp++ = dtab[z >> 4]; + *tp++ = dtab[z & 15]; + } + *tp = 0; + if(rs_len) *rs_len = tp - rs; + found = true; + } + } + } + return found; +} diff --git a/tools/utils/common/dbSNP.h b/tools/utils/common/dbSNP.h new file mode 100644 index 00000000..caf706ec --- /dev/null +++ b/tools/utils/common/dbSNP.h @@ -0,0 +1,45 @@ +/* + * dbSNP_struct.h + * + * Created on: Jan 23, 2020 + * Author: heath + */ +#include +#include +#include "uthash.h" + +#ifndef INCLUDE_DBSNP_H_ +#define INCLUDE_DBSNP_H_ + +typedef struct { + uint64_t mask; + int n_entries; + uint16_t *entries; + uint8_t *name_buf; +} dbsnp_bin_t; + +typedef struct { + char *name; + int min_bin; + int max_bin; + uint64_t file_offset; + dbsnp_bin_t *bins; + UT_hash_handle hh; +} dbsnp_ctg_t; + +typedef struct { + char *filename; + FILE *fp; + dbsnp_ctg_t 
*dbSNP; + uint16_t n_dbSNP_prefixes; + size_t dbSNP_bufsize; + char **dbSNP_prefix; + char *dbSNP_header; +} dbsnp_header_t; + +dbsnp_header_t * load_dbSNP_header(char * const filename); +bool load_dbSNP_ctg(const dbsnp_header_t * const hdr, dbsnp_ctg_t * const ctg); +void unload_dbSNP_ctg(dbsnp_ctg_t * const ctg); +bool dbSNP_lookup_name(const dbsnp_header_t *const hdr, const dbsnp_ctg_t * ctg, char * const rs, size_t * const rs_len, const uint32_t x); + +#endif /* INCLUDE_DBSNP_H_ */ diff --git a/tools/utils/common/uthash.h b/tools/utils/common/uthash.h new file mode 120000 index 00000000..b017556b --- /dev/null +++ b/tools/utils/common/uthash.h @@ -0,0 +1 @@ +../../bs_call/include/uthash.h \ No newline at end of file diff --git a/tools/utils/utils.c b/tools/utils/common/utils.c similarity index 100% rename from tools/utils/utils.c rename to tools/utils/common/utils.c diff --git a/tools/utils/utils.h b/tools/utils/common/utils.h similarity index 99% rename from tools/utils/utils.h rename to tools/utils/common/utils.h index 06d8b8f0..14116c7d 100644 --- a/tools/utils/utils.h +++ b/tools/utils/common/utils.h @@ -4,7 +4,7 @@ #define DEFAULT_PATH "/bin:/usr/bin:/usr/local/bin"; #define READ 0 #define WRITE 1 - +/* #ifndef __unused__ #if defined(__GNUC__) # define __unused__ __attribute__((unused)) @@ -12,7 +12,7 @@ # define __unused__ #endif #endif - +*/ #define COMPRESS_GZIP 0 #define COMPRESS_BZIP2 1 #define COMPRESS_XZ 2 diff --git a/tools/utils/bbi.c b/tools/utils/mextr/bbi.c similarity index 100% rename from tools/utils/bbi.c rename to tools/utils/mextr/bbi.c diff --git a/tools/utils/bbi.h b/tools/utils/mextr/bbi.h similarity index 100% rename from tools/utils/bbi.h rename to tools/utils/mextr/bbi.h diff --git a/tools/utils/bbi_defs.h b/tools/utils/mextr/bbi_defs.h similarity index 100% rename from tools/utils/bbi_defs.h rename to tools/utils/mextr/bbi_defs.h diff --git a/tools/utils/bbi_structs.h b/tools/utils/mextr/bbi_structs.h similarity index 100% rename 
from tools/utils/bbi_structs.h rename to tools/utils/mextr/bbi_structs.h diff --git a/tools/utils/calc_gt_prob.c b/tools/utils/mextr/calc_gt_prob.c similarity index 100% rename from tools/utils/calc_gt_prob.c rename to tools/utils/mextr/calc_gt_prob.c diff --git a/tools/utils/command_line.c b/tools/utils/mextr/command_line.c similarity index 99% rename from tools/utils/command_line.c rename to tools/utils/mextr/command_line.c index d19f3abd..ee435376 100644 --- a/tools/utils/command_line.c +++ b/tools/utils/mextr/command_line.c @@ -40,7 +40,7 @@ const char *usage(void) { "About: Extract CpG and nonCpG sites.\n" "Usage: mextr [file] [regions]\n" "Options:\n" - " -o, --cpgfile Output file for CpG sites (default = stdout)\n" + " -o, --cpgfile Output file for CpG sites (default, not output)\n" " -n, --noncpgfile Output file for nonCpG sites (default, not output)\n" " -b, --bed-methyl Output file base for bedMethly files. Not compatible with multi-sample files (default, not output)\n" " -t, --bed-track-line Track line for for bedMethly files (default, info taken from input VCF file)\n" diff --git a/tools/utils/files.c b/tools/utils/mextr/files.c similarity index 100% rename from tools/utils/files.c rename to tools/utils/mextr/files.c diff --git a/tools/utils/init_params.c b/tools/utils/mextr/init_params.c similarity index 100% rename from tools/utils/init_params.c rename to tools/utils/mextr/init_params.c diff --git a/tools/utils/mextr.c b/tools/utils/mextr/mextr.c similarity index 100% rename from tools/utils/mextr.c rename to tools/utils/mextr/mextr.c diff --git a/tools/utils/mextr.h b/tools/utils/mextr/mextr.h similarity index 100% rename from tools/utils/mextr.h rename to tools/utils/mextr/mextr.h diff --git a/tools/utils/output.c b/tools/utils/mextr/output.c similarity index 100% rename from tools/utils/output.c rename to tools/utils/mextr/output.c diff --git a/tools/utils/output_headers.c b/tools/utils/mextr/output_headers.c similarity index 100% rename from 
tools/utils/output_headers.c rename to tools/utils/mextr/output_headers.c diff --git a/tools/utils/output_utils.c b/tools/utils/mextr/output_utils.c similarity index 100% rename from tools/utils/output_utils.c rename to tools/utils/mextr/output_utils.c diff --git a/tools/utils/rec.c b/tools/utils/mextr/rec.c similarity index 100% rename from tools/utils/rec.c rename to tools/utils/mextr/rec.c diff --git a/tools/utils/stats.c b/tools/utils/mextr/stats.c similarity index 100% rename from tools/utils/stats.c rename to tools/utils/mextr/stats.c diff --git a/tools/utils/unpack.c b/tools/utils/mextr/unpack.c similarity index 100% rename from tools/utils/unpack.c rename to tools/utils/mextr/unpack.c diff --git a/tools/utils/readNameClean.c b/tools/utils/readNameClean.c deleted file mode 100644 index 29d0817f..00000000 --- a/tools/utils/readNameClean.c +++ /dev/null @@ -1,139 +0,0 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include - -#include "utils.h" -#include "uthash.h" - -// Strip illegal characters from Read IDs in SAM file -// Valid characters are [!-?A-~] - -// Option to edit SAM headers, adding extra information to the @SQ lines - -#define NUM_SQTAGS 4 -static char *sqtags[NUM_SQTAGS] = { - "LN", "M5", "AS", "SP" -}; - -typedef struct { - char *name; - char *tags[NUM_SQTAGS]; - UT_hash_handle hh; -} ctg_t; - -ctg_t *process_ctg_file(char *name) { - ctg_t *ctgs = NULL; - bool flag; - FILE *fp = open_readfile(name, &flag); - if(fp == NULL) { - fprintf(stderr, "Could not open %s for reading\n", name); - exit(-1); - } - char *buf = NULL; - size_t buf_size = 0, tlen = 0; - ssize_t l; - tokens *tok = NULL; - while(1) { - l = getline(&buf, &buf_size, fp); - if(l < 0) break; - tok = tokenize(buf, '\t', tok); - if(tok->n_tok > 1) { - ctg_t *ct = NULL; - HASH_FIND_STR(ctgs, tok->toks[0], ct); - if(ct != NULL) { - fprintf(stderr, "process_ctg_file(): error - duplicate contig %s\n", tok->toks[0]); - exit(-1); - } - ct = malloc(sizeof(ctg_t)); - ct->name 
= strdup(tok->toks[0]); - for(int i = 0; i < NUM_SQTAGS; i++) ct->tags[i] = NULL; - for(int i = 1; i < tok->n_tok; i++) { - const char * const p = tok->toks[i]; - for(int j = 0; j < NUM_SQTAGS; j++) { - if(!strncmp(p, sqtags[j], 2) && p[2] == ':') { - ct->tags[j] = strdup(p + 3); - break; - } - } - } - HASH_ADD_KEYPTR(hh, ctgs, ct->name, strlen(ct->name), ct); - } - } - fclose(fp); - if(flag) while(waitpid(-1, NULL, 0) > 0); - if(tok != NULL) free_tokens(tok); - if(buf != NULL) free(buf); - return ctgs; -} - -int main(int argc, char *argv[]) { - FILE *fp = stdin; - char *buf = NULL; - size_t buf_size = 0; - ssize_t l; - ctg_t *ctgs = NULL; - - if(argc > 1) ctgs = process_ctg_file(argv[1]); - // Process header lines - no conversion - while(1) { - l = getline(&buf, &buf_size, fp); - if(l < 0) return 0; - if(buf[0] != '@') break; - bool pflag = true; - if(l > 8 && !strncmp(buf + 1, "SQ\tSN:", 6)) { - char *p = buf + 7; - char *p1 = p; - while(*p1 && *p1 != '\t' && *p1 != '\n') p1++; - size_t l = p1 - p; - ctg_t *ct = NULL; - HASH_FIND(hh, ctgs, p, l, ct); - if(ct) { - pflag = false; - int mask = 0; - char c = *p1; - *p1 = 0; - fputs(buf, stdout); - while(c == '\t') { - p1++; - p = p1; - for(int j = 0; j < NUM_SQTAGS; j++) { - if(!strncmp(p1, sqtags[j], 2) && p1[2] == ':') { - mask |= (1 << j); - break; - } - } - while(*p1 && *p1 != '\t' && *p1 != '\n') p1++; - c = *p1; - *p1 = 0; - printf("\t%s", p); - } - int j = 0; - for(int j = 0; j < NUM_SQTAGS; j++) { - if(ct->tags[j] != NULL && !(mask & (1 << j))) printf("\t%s:%s", sqtags[j], ct->tags[j]); - } - fputc('\n', stdout); - } - } - if(pflag) fputs(buf, stdout); - } - // Process the rest of the file - while(l >= 0) { - int i; - bool found = false; - for(i = 0; i < l && buf[i] != '\t'; i++) if((found = (buf[i] == '@' || buf[i] < '!' || buf[i] > '~'))) break; - if(found) { - int j = i; - for(i = i + 1; i < l && buf[i] != '\t'; i++) { - if(buf[i] != '@' && buf[i] >= '!' 
&& buf[i] <= '~') buf[j++] = buf[i]; - } - for(; i <= l; i++) buf[j++] = buf[i]; - } - fputs(buf, stdout); - l = getline(&buf, &buf_size, fp); - } - if(buf) free(buf); - return 0; -} diff --git a/tools/utils/snpxtr/command_line.c b/tools/utils/snpxtr/command_line.c new file mode 100644 index 00000000..e9bc962d --- /dev/null +++ b/tools/utils/snpxtr/command_line.c @@ -0,0 +1,162 @@ +/* + * command_line.c + * + * Created on: Dec 26, 2019 + * Author: heath + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "htslib/hfile.h" +#include "htslib/khash_str2int.h" + +#include "utils.h" +#include "snpxtr.h" + +// These are copied from htslib:synced_bcf_reader.c as the definitions are not visible +// in the standard library and we need them to allow sorting of regions + +typedef struct { + hts_pos_t start, end; +} region1_t; + +struct _region_t { + region1_t *regs; + int nregs, mregs, creg; +}; + +const char *usage(void) { + return + "\n" + "About: Extract SNPs from VCF/BCF file.\n" + "Usage: snpxtr [file] [regions]\n" + "Options:\n" + " -o, --output Output file (default = stdout)\n" + " -s, --snps File with list of SNPs to be selected (default, select all sites with PASS)\n" + " -D, --dbsnp dbSNP index file (used to add external ids if not present in input file\n" + " -r, --regions restrict to comma separated list of regions\n" + " -R, --regions-file restrict to regions listed in file\n" + " -@, --threads Extra threads\n" + " -z, --bgzip Compress output with bgzip\n" + " -m, --md5 Calculate md5 digest for output file (if not stdout)\n" + " -x, --tabix Generate tabix (tbx) index for compressed output file\n" + "\n"; +} + +static struct option loptions[] = { + {"output",required_argument,0,'o'}, + {"snps",required_argument,0,'s'}, + {"dbsnp",required_argument,0,'D'}, + {"regions",required_argument,0,'r'}, + {"regions-file",required_argument,0,'R'}, + {"regions",required_argument,0,'r'}, + 
{"threads",required_argument,0,'@'}, + {"bgzip",no_argument,0,'z'}, + {"md5",no_argument,0,'m'}, + {"tabix",no_argument,0,'x'}, + {"help",no_argument,0,'h'}, + {0,0,0,0} +}; + + +void handle_command_line(int argc, char *argv[], sargs_t * const args) { + int c; + bool regions_file = false; + char *regions_list = NULL; + while ((c = getopt_long(argc, argv, "o:s:D:r:R:@:zmxh?",loptions,NULL)) >= 0) { + switch (c) { + case 'o': + args->outfilename = optarg; + break; + case 's': + args->snplistname = optarg; + break; + case 'D': + args->dbSNPfilename = optarg; + break; + case 'x': + args->tabix = true; + break; + case 'm': + args->md5 = true; + break; + case 'R': + regions_file = true; + // fall through + case 'r': + regions_list = optarg; + break; + case '@': + args->threads = atoi(optarg); + if(args->threads < 0) args->threads = 0; + break; + case 'z': + args->compress = true; + break; + case 'h': + case '?': + default: error(usage()); break; + } + } + char *fname = NULL; + if(optind == argc) error(usage()); + else fname = argv[optind]; + args->sr = bcf_sr_init(); + bcf_sr_set_threads(args->sr, args->threads); + // Process region arguments if present + if(regions_list) { + if(bcf_sr_set_regions(args->sr, regions_list, regions_file) < 0) error("Failed to parse the regions: %s\n", regions_list); + } else if(optind + 1 < argc) { + kstring_t tmp = {0, 0, 0}; + kputs(argv[optind + 1], &tmp); + for(int k = optind + 2; k < argc; k++) { + kputc(',', &tmp); + kputs(argv[k], &tmp); + } + if(bcf_sr_set_regions(args->sr, tmp.s, 0) < 0) error("Failed to parse the regions: %s\n", tmp.s); + free(tmp.s); + } + if(args->threads > 0) bcf_sr_set_threads(args->sr, args->threads); + if(!bcf_sr_add_reader(args->sr, fname)) + error("failed to read from %s: %s\n", fname, bcf_sr_strerror(args->sr->errnum)); + args->hdr = args->sr->readers[0].header; + args->pass_idx = args->gt_idx = -1; + for(int i = 0; i < args->hdr->n[BCF_DT_ID]; i++) { + if(args->pass_idx < 0 && !strcasecmp("PASS", 
args->hdr->id[BCF_DT_ID][i].key)) args->pass_idx = i; + if(args->gt_idx < 0 && !strcasecmp("GT", args->hdr->id[BCF_DT_ID][i].key)) args->gt_idx = i; + } + int ns = bcf_hdr_nsamples(args->hdr); + assert(ns > 0); + args->gt = malloc(sizeof(int) * ns); + bcf_sr_regions_t * const reg = args->sr->regions; + int nctgs_vcf = args->hdr->n[BCF_DT_CTG]; + args->cumul_len = malloc(nctgs_vcf * sizeof(uint64_t)); + if(reg) { + for(int i = 0; i < nctgs_vcf; i++) { + const bcf_idpair_t * const idp = args->hdr->id[BCF_DT_CTG] + i; + int k; + uint64_t len = 0; + int ret = khash_str2int_get(args->sr->regions->seq_hash, idp->key, &k); + if(ret >= 0) len = reg->regs[k].regs->end + 1; + args->cumul_len[i] = i ? args->cumul_len[i - 1] + len : len; + } + int nctgs = reg->nseqs; + } else { + for(int i = 0; i < nctgs_vcf; i++) { + const bcf_idpair_t * const idp = args->hdr->id[BCF_DT_CTG] + i; + uint64_t len = idp->val->info[0]; + args->cumul_len[i] = i ? args->cumul_len[i - 1] + len : len; + } + } + if(!args->outfilename) args->outfilename = "-"; + args->outfile = open_ofile(&args->outfilename, args->compress, args); +} diff --git a/tools/utils/snpxtr/files.c b/tools/utils/snpxtr/files.c new file mode 100644 index 00000000..9809fc67 --- /dev/null +++ b/tools/utils/snpxtr/files.c @@ -0,0 +1,111 @@ +/* + * files.c + * + * Created on: Jan 24, 2020 + * Author: heath + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "snpxtr.h" +#include "htslib/bgzf.h" +#include "htslib/hfile.h" + +htsFile *open_ofile(char ** const name, bool compress, sargs_t * const a) { + + htsFile *fp = NULL; + if(name != NULL) { + char *tname = *name; + char mode[3] = {'w', 0, 0}; + bool stream = !strcmp(tname, "-"); + + // If output is to a file then if compression has been asked for + // we add '.gz' to the filename unless already present. 
If compression + // had not been asked for but the filename ends in '.gz' then we + // turn on compression + + if(!stream) { + // Check if file name ends in '.gz' + char *p = strrchr(tname, '.'); + bool has_gz = p && !strcmp(p + 1, "gz"); + if(compress) { + if(!has_gz) { + tname = malloc(strlen(*name) + 4); + sprintf(tname, "%s.gz", *name); + } + } else compress = has_gz; + } else { + // Turn off compression if output is to a terminal + if(compress && isatty(fileno(stdout))) compress = false; + } + if(compress) mode[1] = 'z'; + hFILE *hfile = hopen(tname, mode); + if(!hfile) error("Couldn't open output file: %s\n", stream ? "" : tname); + fp = hts_hopen(hfile, tname, mode); + if(a->threads > 0) hts_set_opt(fp, HTS_OPT_THREAD_POOL, a->sr->p); + if(tname != *name) *name = tname; + } + return fp; +} + +#define MD5_BUF_SIZE 4096 + +void calc_stream_md5(FILE * const fp, char * const md5) { + MD5_CTX ctx; + MD5_Init(&ctx); + uint8_t buf[MD5_BUF_SIZE]; + while(!feof(fp)) { + size_t len = fread(buf, 1, MD5_BUF_SIZE, fp); + if(len > 0) MD5_Update(&ctx, buf, len); + } + unsigned char b[16]; + const char *hex_digits="0123456789abcdef"; + MD5_Final(b, &ctx); + int k = 0; + for(int i = 0; i < 16; i++) { + md5[k++] = hex_digits[b[i] >> 4]; + md5[k++] = hex_digits[b[i] & 0xf]; + } + md5[k] = 0; +} + +void calc_file_md5(char * const name) { + char *tname = malloc(strlen(name) + 5); + sprintf(tname, "%s.md5", name); + FILE *in = fopen(name, "rb"); + FILE *out = NULL; + int err = 0; + if(in == NULL) { + fprintf(stderr, "calc_file_md5(): Could not open file %s for input: %s\n", name, strerror(errno)); + err = 1; + } else { + out = fopen(tname, "wb"); + if(out == NULL) { + fprintf(stderr, "calc_file_md5(): Could not open file %s for output: %s\n", tname, strerror(errno)); + err = 2; + } + } + char md5[33]; + if(!err) { +#ifndef __MACH__ + posix_fadvise(fileno(in), 0, 0, POSIX_FADV_SEQUENTIAL); +#endif + calc_stream_md5(in, md5); + fprintf(out,"%s %s\n", md5, name); + } + if(out) 
fclose(out); + if(in) fclose(in); + free(tname); +} + + + diff --git a/tools/utils/snpxtr/init_params.c b/tools/utils/snpxtr/init_params.c new file mode 100644 index 00000000..33e66e5d --- /dev/null +++ b/tools/utils/snpxtr/init_params.c @@ -0,0 +1,25 @@ +/* + * init_params.c + * + * Created on: Dec 26, 2019 + * Author: heath + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils.h" +#include "snpxtr.h" + +#include "htslib/hfile.h" + +void init_params(sargs_t *const args) { + memset(args, 0, sizeof(sargs_t)); + ks_initialize(&args->out_string); +} diff --git a/tools/utils/snpxtr/process.c b/tools/utils/snpxtr/process.c new file mode 100644 index 00000000..f3ff4190 --- /dev/null +++ b/tools/utils/snpxtr/process.c @@ -0,0 +1,197 @@ +/* + * process.c + * + * Created on: Jan 23, 2020 + * Author: heath + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "snpxtr.h" + +#include +#include "htslib/bgzf.h" +#include "htslib/hfile.h" +#include "htslib/tbx.h" +#include + +static int base_tab[256] = { + ['A'] = 1, ['C'] = 2, ['G'] = 3, ['T'] = 4 +}; +static void * dat_p = NULL; +static int dat_n = 0; + +KHASH_SET_INIT_STR(str); + +void make_tabix_index(char * const fname) { + tbx_conf_t conf = tbx_conf_vcf; + tbx_index_build(fname, 0, &conf); +} + +void *md5_thread(void *p) { + calc_file_md5(p); + return NULL; +} + +void process_input(sargs_t * const args) { + dbsnp_header_t * const dbsnp_hdr = args->dbSNP_hdr; + dbsnp_ctg_t *dbSNP_ctg = NULL; + char rs[512]; + int rid = -1; + bcf_srs_t * const sr = args->sr; + bcf1_t *rec = bcf_init(); + khash_t(str) *h = args->snp_hash; + + while(bcf_sr_next_line(sr)) { + bcf_sr_swap_line(sr, 0, rec); + if(rec->rid != rid) { + if(dbsnp_hdr) { + const char * const cname = args->hdr->id[BCF_DT_CTG][rec->rid].key; + if(dbSNP_ctg) unload_dbSNP_ctg(dbSNP_ctg); + HASH_FIND(hh, dbsnp_hdr->dbSNP, cname, 
strlen(cname), dbSNP_ctg); + if(dbSNP_ctg) { + bool ret = load_dbSNP_ctg(dbsnp_hdr, dbSNP_ctg); + if(!ret) dbSNP_ctg = NULL; + } + } + rid = rec->rid; + } + int ns = bcf_hdr_nsamples(args->hdr); + bcf_unpack(rec, BCF_UN_ALL); + char *id = rec->d.id; + bool rs_id; + if(id[0]=='.' && id[1]==0) { + rs_id = false; + if(dbSNP_ctg) { + if(dbSNP_lookup_name(dbsnp_hdr, dbSNP_ctg, rs, NULL, rec->pos + 1)) { + id = rs; + rs_id = true; + } + } + } else rs_id = true; + if(rs_id) { + bool passed = true; + if(h) { + khint_t k = kh_get(str, h, id); + if(k == kh_end(h)) passed = false; + } + if(passed) { + passed = false; + for(int i = 0; i < rec->d.n_flt; i++) if(rec->d.flt[i] == args->pass_idx) { + passed = true; + break; + } + } + // Check for SNP + int n_all = rec->n_allele; + if(passed) { + if(n_all > 4) passed = false; + else for(int i = 0; i < n_all; i++) { + char * const p = rec->d.allele[i]; + if(p[1] || !base_tab[(int)p[0]]) { + passed = false; + break; + } + } + } + if(passed) { + int ne = bcf_get_format_values(args->hdr, rec, "FT", &dat_p, &dat_n, BCF_HT_STR); + int gt_ix = -1; + bcf_fmt_t * const fmt = rec->d.fmt; + // Get GT Tag + for(int i = 0; i < (int)rec->n_fmt; i++) { + if(!fmt[i].p) continue; + if(fmt[i].id == args->gt_idx) { + gt_ix = i; + break; + } + } + if(gt_ix >= 0) { + int sz = ne / ns; + char *flt = dat_p; + bcf_fmt_t *fmt = rec->d.fmt + gt_ix; + passed = false; + for(int i = 0; i < ns; i++) { + args->gt[i] = 0; + switch(fmt->type) { + case BCF_BT_INT8: + { + if(fmt->n == 2) { + int8_t *p = (int8_t *)(fmt->p + i * fmt->size); + if(p[0] != bcf_int8_vector_end && p[1] != bcf_int8_vector_end) { + int a1 = p[0] >> 1; + int a2 = p[1] >> 1; + if(a1 < 1 || a1 > n_all || a2 < 1 || a2 > n_all) args->gt[i] = 0; + else args->gt[i] = (a1 << 4) | a2; + } + } + } + break; + case BCF_BT_INT16: + { + if(fmt->n == 2) { + int16_t *p = (int16_t *)(fmt->p + i * fmt->size); + if(p[0] != bcf_int16_vector_end && p[1] != bcf_int16_vector_end) { + int a1 = p[0] >> 1; + int a2 = 
p[1] >> 1; + if(a1 < 1 || a1 > n_all || a2 < 1 || a2 > n_all) args->gt[i] = 0; + else args->gt[i] = (a1 << 4) | a2; + } + } + } + break; + case BCF_BT_INT32: + { + if(fmt->n == 2) { + int32_t *p = (int32_t *)(fmt->p + i * fmt->size); + if(p[0] != bcf_int32_vector_end && p[1] != bcf_int32_vector_end) { + int a1 = p[0] >> 1; + int a2 = p[1] >> 1; + if(a1 < 1 || a1 > n_all || a2 < 1 || a2 > n_all) args->gt[i] = 0; + else args->gt[i] = (a1 << 4) | a2; + } + } + } + break; + } + if(args->gt[i]) passed = true; + } + } + } + if(passed) { + kstring_t *s = ks_clear(&args->out_string); + ksprintf(s, "%s\t%" PRId64 "\t%s", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + 1, id); + for(int i = 0; i < ns; i++) { + const int gt = args->gt[i]; + if(gt > 0) ksprintf(s, "\t%s%s", rec->d.allele[(gt >> 4) -1], rec->d.allele[(gt & 7) -1]); + else kputs("\t00", s); + } + kputc('\n', s); + htsFile * const fp = args->outfile; + int r; + if((fp)->format.compression != no_compression) r = bgzf_write((fp)->fp.bgzf, (s)->s, (s)->l); + else r = hwrite((fp)->fp.hfile, (s)->s, (s)->l); + if(r != (s)->l) error("output error writing to %s\n", (fp)->fn ? 
(fp)->fn : ""); + } + } + } + if(dbSNP_ctg) unload_dbSNP_ctg(dbSNP_ctg); + bcf_destroy(rec); + if(strcmp(args->outfilename, "-")) { + hts_close(args->outfile); + pthread_t md5_th; + if(args->md5) pthread_create(&md5_th, NULL, md5_thread, args->outfilename); + if(args->tabix) make_tabix_index(args->outfilename); + if(args->md5) pthread_join(md5_th, NULL); + } +} diff --git a/tools/utils/snpxtr/snplist.c b/tools/utils/snpxtr/snplist.c new file mode 100644 index 00000000..63801634 --- /dev/null +++ b/tools/utils/snpxtr/snplist.c @@ -0,0 +1,63 @@ +/* + * snplist.c + * + * Created on: Jan 23, 2020 + * Author: heath + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "snpxtr.h" + +#include + +#include "utils.h" + +KHASH_SET_INIT_STR(str); + +void read_snplist(sargs_t * const args) { + bool filter; + FILE *fp = open_readfile_and_check(args->snplistname, &filter); + char *buf = NULL; + size_t buf_size = 0; + int n_snps = 0; + khash_t(str) *h; + h = kh_init(str); + fprintf(stderr, "Reading SNP list from %s\n", args->snplistname); + for(;;) { + ssize_t l = getline(&buf, &buf_size, fp); + if(l < 0) break; + int i; + for(i = 0; i < l; i++) if(!isspace(buf[i])) break; + int j = i; + for(; i < l; i++) if(isspace(buf[i])) break; + if(i == j) continue; + buf[i] = 0; + int not_found; + khint_t k = kh_put(str, h, buf + j, ¬_found); + if(not_found) { + n_snps++; + kh_key(h, k) = strdup(buf + j); + } + } + args->snp_hash = h; + fclose(fp); + if(buf) free(buf); + if(filter) { + int i; + while(waitpid(-1, &i, WNOHANG) > 0); + } + fprintf(stderr, "List of %d unique SNPs read in\n", n_snps); +} + diff --git a/tools/utils/snpxtr/snpxtr.c b/tools/utils/snpxtr/snpxtr.c new file mode 100644 index 00000000..3c17d807 --- /dev/null +++ b/tools/utils/snpxtr/snpxtr.c @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include 
"utils.h" +#include "snpxtr.h" + +#include "htslib/hfile.h" +#include "htslib/bgzf.h" + +void error(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + exit(-1); +} + +int main(int argc, char **argv) { + + sargs_t args; + init_params(&args); + handle_command_line(argc, argv, &args); + int ns = bcf_hdr_nsamples(args.hdr); + assert(ns > 0); + if(args.snplistname) read_snplist(&args); + if(args.dbSNPfilename) args.dbSNP_hdr = load_dbSNP_header(args.dbSNPfilename); +// init_files(&args); + process_input(&args); + fprintf(stderr,"snpxtr: finished\n"); + bcf_sr_destroy(args.sr); + return 0; +} + + diff --git a/tools/utils/snpxtr/snpxtr.h b/tools/utils/snpxtr/snpxtr.h new file mode 100644 index 00000000..8d7233d5 --- /dev/null +++ b/tools/utils/snpxtr/snpxtr.h @@ -0,0 +1,54 @@ +#ifndef MEXTR_H_ +#define MEXTR_H_ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dbSNP.h" + +#define LOG10 2.30258509299404568402 + +void error(const char *format, ...) HTS_NORETURN; + +typedef struct { + bcf_hdr_t *hdr; + bcf_srs_t *sr; + char *snplistname; + char *outfilename; + char *dbSNPfilename; + void *snp_hash; + int *gt; + dbsnp_header_t *dbSNP_hdr; + htsFile *outfile; + uint64_t *cumul_len; + kstring_t out_string; + int threads; + int pass_idx; + int gt_idx; + bool compress; + bool tabix; + bool md5; +} sargs_t; + +#define ks_output(fp, s) { \ + int r; \ + if((fp)->format.compression != no_compression) r = bgzf_write((fp)->fp.bgzf, (s)->s, (s)->l); \ + else r = hwrite((fp)->fp.hfile, (s)->s, (s)->l); \ + if(r != (s)->l) error("output error writing to %s\n", (fp)->fn ? 
(fp)->fn : ""); \ +} \ + +const char *usage(void); +void handle_command_line(int argc, char *argv[], sargs_t * const args); +void init_params(sargs_t *const args); +void read_snplist(sargs_t * const args); +void process_input(sargs_t * const args); +htsFile *open_ofile(char ** const name, bool compress, sargs_t * const a); +void calc_file_md5(char * const name); + +#endif // MEXTR_H_ diff --git a/tools/utils/uthash.h b/tools/utils/uthash.h deleted file mode 120000 index 4aedc371..00000000 --- a/tools/utils/uthash.h +++ /dev/null @@ -1 +0,0 @@ -../bs_call/include/uthash.h \ No newline at end of file From 8aeaf5eb1a6e945052aac8682debdaf956adddfb Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sun, 26 Jan 2020 11:06:33 +0100 Subject: [PATCH 30/61] remove old bcftools plugins and move readnameClean --- tools/Makefile | 6 +- tools/gemBS_plugins/calc_gt_prob.c | 327 -------- tools/gemBS_plugins/compress.c | 85 -- tools/gemBS_plugins/compress.h | 19 - tools/gemBS_plugins/mextr.c | 591 -------------- tools/gemBS_plugins/mextr.h | 109 --- tools/gemBS_plugins/mextr.mk | 2 - tools/gemBS_plugins/output.c | 407 ---------- tools/gemBS_plugins/snpxtr.c | 603 -------------- tools/gemBS_plugins/snpxtr.h | 56 -- tools/gemBS_plugins/snpxtr.mk | 2 - tools/gemBS_plugins/uthash.h | 917 ---------------------- tools/gemBS_plugins/utils.c | 386 --------- tools/gemBS_plugins/utils.h | 39 - tools/utils/readNameClean/readNameClean.c | 139 ++++ 15 files changed, 140 insertions(+), 3548 deletions(-) delete mode 100644 tools/gemBS_plugins/calc_gt_prob.c delete mode 100644 tools/gemBS_plugins/compress.c delete mode 100644 tools/gemBS_plugins/compress.h delete mode 100644 tools/gemBS_plugins/mextr.c delete mode 100644 tools/gemBS_plugins/mextr.h delete mode 100644 tools/gemBS_plugins/mextr.mk delete mode 100644 tools/gemBS_plugins/output.c delete mode 100644 tools/gemBS_plugins/snpxtr.c delete mode 100644 tools/gemBS_plugins/snpxtr.h delete mode 100644 tools/gemBS_plugins/snpxtr.mk delete mode 
100644 tools/gemBS_plugins/uthash.h delete mode 100644 tools/gemBS_plugins/utils.c delete mode 100644 tools/gemBS_plugins/utils.h create mode 100644 tools/utils/readNameClean/readNameClean.c diff --git a/tools/Makefile b/tools/Makefile index 1619c38f..3c81fadb 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -70,7 +70,7 @@ gem3-mapper/Makefile.mk: gem3-mapper/Makefile.mk.in gem3-mapper/configure $(SAMTOOLS_DIR)/config.mk: cd $(SAMTOOLS_DIR); ./configure # --disable-lzma -$(FOLDER_BIN)/bcftools: $(BCFTOOLS_DIR) $(BCFTOOLS_DIR)/plugins/mextr.c $(BCFTOOLS_DIR)/plugins/snpxtr.c +$(FOLDER_BIN)/bcftools: $(BCFTOOLS_DIR) $(MAKE) --directory=$(BCFTOOLS_DIR) all $(BCFTOOLS_DIR)/config.h: @@ -85,10 +85,6 @@ $(BCFTOOLS_DIR): wget $(BCFTOOLS_URL) && tar -jxf $(BCFTOOLS_TAR) && rm -f $(BCFTOOLS_TAR) mv bcftools-$(BCFTOOLS_VERSION) $(BCFTOOLS_DIR) -$(BCFTOOLS_DIR)/plugins/%.c: - ln -sf ../../gemBS_plugins/$(notdir $@) $(BCFTOOLS_DIR)/plugins/ - ln -sf ../../gemBS_plugins/$(basename $(notdir $@)).mk $(BCFTOOLS_DIR)/plugins/ - clean: @rm -f *~ @rm -rf $(FOLDER_BIN) diff --git a/tools/gemBS_plugins/calc_gt_prob.c b/tools/gemBS_plugins/calc_gt_prob.c deleted file mode 100644 index e29a18aa..00000000 --- a/tools/gemBS_plugins/calc_gt_prob.c +++ /dev/null @@ -1,327 +0,0 @@ -#include -#include -#include -#include - -#include "mextr.h" - -typedef struct { - double e, k, ln_k[3]; -} qual_prob; - -static qual_prob q_prob[MAX_QUAL + 1]; - -void fill_base_prob_table(void) { - for (int q = 0; q <= MAX_QUAL; q++) { - double e = exp(-.1 * (double)q * LOG10); - if(e > .5) e = .5; - double k = e / (3.0 - 4.0 * e); - q_prob[q].e = e; - q_prob[q].k = k; - q_prob[q].ln_k[0] = log(k); - q_prob[q].ln_k[1] = log(0.5 + k); - q_prob[q].ln_k[2] = log(1.0 + k); - } -} - -static inline void get_Z(double x1, double x2, double k1, double k2, double l, double t, double *Z) { - double lpt = l + t; - double lmt = l - t; - double d = (x1 + x2) * lmt; - // w = 1, p = 1 - double sinm = (x1 * (lpt + 2.0 * k2) - 
x2 * (2.0 - lpt + 2.0 * k1)) / d; - if(sinm < -1.0) sinm = -1.0; - else if(sinm > 1.0) sinm = 1.0; - Z[0] = 0.5 * (lmt * sinm + 2.0 - lpt); - // w = 1, p = 1/2 - sinm = (x1 * (2.0 + lpt + 4.0 * k2) - x2 * (2.0 - lpt + 4.0 * k1)) / d; - if(sinm < -1.0) sinm = -1.0; - else if(sinm > 1.0) sinm = 1.0; - Z[1] = 0.5 * (lmt * sinm + 2.0 - lpt); - // w = 1/2, p = 1 - sinm = (x1 * (lpt + 4.0 * k2) - x2 * (2.0 - lpt + 4.0 * k1)) / d; - if(sinm < -1.0) sinm = -1.0; - else if(sinm > 1.0) sinm = 1.0; - Z[2] = 0.5 * (lmt * sinm + 2.0 - lpt); -} - -static void add_bias(double *ll, char rf, double ref_bias) { - double lrb = log(ref_bias); - double lrb1 = log(0.5 * (1.0 + ref_bias)); - memset(ll, 0, sizeof(double) * 10); - switch (rf) { - case 'A': - ll[0] = lrb; - ll[1] = ll[2] = ll[3] = lrb1; - break; - case 'C': - ll[4] = lrb; - ll[1] = ll[5] = ll[6] = lrb1; - break; - case 'G': - ll[7] = lrb; - ll[2] = ll[5] = ll[8] = lrb1; - break; - case 'T': - ll[9] = lrb; - ll[3] = ll[6] = ll[8] = lrb1; - break; - } -} - -// This function is taken from genotype_model.c in bs_call -// As far as possible the two functions should be kept in sync -// (Yes, a shared library would make more sense...to do) -void calc_gt_prob(gt_meth *gt, args_t *args, char rf) { - qual_prob qp[8]; - for(int i = 0; i < 8; i++) qp[i] = q_prob[gt->aqual[i]]; - double l = 1.0 - args->under_conv; - double t = args->over_conv; - double n[8]; - for (int i = 0; i < 8; i++) n[i] = (double)gt->counts[i]; - double ll[10]; - double ref_bias = args->ref_bias; - // Add in prior from reference - add_bias(ll, rf, ref_bias); - if (n[0]) { - ll[0] += n[0] * qp[0].ln_k[2]; // AA - double tz = n[0] * qp[0].ln_k[1]; - ll[1] += tz; // AC - ll[2] += tz; // AG - ll[3] += tz; // AT - tz = n[0] * qp[0].ln_k[0]; - ll[4] += tz; // CC - ll[5] += tz; // CG - ll[6] += tz; // CT - ll[7] += tz; // GG - ll[8] += tz; // GT - ll[9] += tz; // TT - } - if (n[1]) { - ll[4] += n[1] * qp[1].ln_k[2]; // CC - double tz = n[1] * qp[1].ln_k[1]; - ll[1] += 
tz; // AC - ll[5] += tz; // CG - ll[6] += tz; // CT - tz = n[1] * qp[1].ln_k[0]; - ll[0] += tz; // AA - ll[2] += tz; // AG - ll[3] += tz; // AT - ll[7] += tz; // GG - ll[8] += tz; // GT - ll[9] += tz; // TT - } - if (n[2]) { - ll[7] += n[2] * qp[2].ln_k[2]; // GG - double tz = n[2] * qp[2].ln_k[1]; - ll[2] += tz; // AG - ll[5] += tz; // CG - ll[8] += tz; // TG - tz = n[2] * qp[2].ln_k[0]; - ll[0] += tz; // AA - ll[1] += tz; // AC - ll[3] += tz; // AT - ll[4] += tz; // CC - ll[6] += tz; // CT - ll[9] += tz; // TT - } - if (n[3]) { - ll[9] += n[3] * qp[3].ln_k[2]; // TT - double tz = n[3] * qp[3].ln_k[1]; - ll[3] += tz; // AT - ll[6] += tz; // CT - ll[8] += tz; // GT - tz = n[3] * qp[3].ln_k[0]; - ll[0] += tz; // AA - ll[1] += tz; // AC - ll[2] += tz; // AG - ll[4] += tz; // CC - ll[5] += tz; // CG - ll[7] += tz; // GG - } - double Z[6] = {-1.0, -1.0, -1.0, -1.0, -1.0, -1.0}; - if (n[5] + n[7] > 0.0) { - get_Z(n[5], n[7], qp[5].k, qp[7].k, l, t, Z); - for(int k = 0; k < 3; k++) gt->cmeth[k] = (Z[k] - 1.0 + l) / (l - t); - } - if (n[4] + n[6] > 0.0) { - get_Z(n[6], n[4], qp[6].k, qp[4].k, l, t, Z+3); - for(int k = 0; k < 3; k++) gt->gmeth[k] = (Z[k + 3] - 1.0 + l) / (l - t); - } - if (n[4]) { - ll[0] += n[4] * qp[4].ln_k[2]; // AA - ll[2] += log(1.0 - 0.5 * Z[4] + qp[4].k) * n[4]; // AG - ll[7] += log(1.0 - Z[3] + qp[4].k) * n[4]; // GG - double tz = log(0.5 * (1.0 - Z[5]) + qp[4].k) * n[4]; - ll[5] += tz; // CG - ll[8] += tz; // GT - tz = n[4] * qp[4].ln_k[1]; - ll[1] += tz; // AC - ll[3] += tz; // AT - tz = n[4] * qp[4].ln_k[0]; - ll[4] += tz; // CC - ll[6] += tz; // CT - ll[9] += tz; // TT - } - if (n[5]) { - ll[4] += log(Z[0] + qp[5].k) * n[5]; // CC - double tz = log(0.5 * Z[2] + qp[5].k) * n[5]; - ll[1] += tz; // AC - ll[5] += tz; // CG - ll[6] += log(0.5 * Z[1] + qp[5].k) * n[5]; // CT - tz = n[5] * qp[5].ln_k[0]; - ll[0] += tz; // AA - ll[2] += tz; // AG - ll[3] += tz; // AT - ll[7] += tz; // GG - ll[8] += tz; // GT - ll[9] += tz; // TT - } - if (n[6]) { - 
ll[7] += log(Z[3] + qp[6].k) * n[6]; // GG - double tz = log(0.5 * Z[5] + qp[6].k) * n[6]; - ll[5] += tz; // CG - ll[8] += tz; // TG - ll[2] += log(0.5 * Z[4] + qp[6].k) * n[6]; // AG - tz = n[6] * qp[6].ln_k[0]; - ll[0] += tz; // AA - ll[1] += tz; // AC - ll[3] += tz; // AT - ll[4] += tz; // CC - ll[6] += tz; // CT - ll[9] += tz; // TT - } - if (n[7]) { - ll[9] += n[7] * qp[7].ln_k[2]; // TT - ll[4] += log(1.0 - Z[0] + qp[7].k) * n[7]; // CC - ll[6] += log(1.0 - 0.5 * Z[1] + qp[7].k) * n[7]; // CT - double tz = log(0.5 * (1.0 - Z[2]) + qp[7].k) * n[7]; - ll[1] += tz; // AC - ll[5] += tz; // CG - tz = n[7] * qp[7].ln_k[1]; - ll[3] += tz; // AT - ll[8] += tz; // GT - tz = n[7] * qp[7].ln_k[0]; - ll[0] += tz; // AA - ll[2] += tz; // AG - ll[7] += tz; // GG - } - double max = ll[0]; - int mx = 0; - for (int i = 1; i < 10; i++) { - if (ll[i] > max) { - max = ll[i]; - mx = i; - } - } - gt->max_gt = mx; - double sum = 0.0; - for (int i = 0; i < 10; i++) { - sum += exp(ll[i] - max); - } - sum = log(sum); - gt->sum = sum + max; - for (int i = 0; i < 10; i++) { - gt->gt_prob[i] = (ll[i] - max - sum); - } -} - -static int gt_idx[10][2] = { - {-1, -1}, // AA - {2, -1}, // AC - {-1, 1}, // AG - {-1, -1}, // AT - {0, -1}, // CC - {2, 2}, // CG - {1, -1}, // CT - {-1, 0}, // GG - {-1, 2}, // GT - {-1, -1} // TT -}; - -double get_meth(gt_meth *g, int idx) { - double m = -1.0; - int i = gt_idx[g->max_gt][idx]; - if(i >= 0) m = idx ? g->gmeth[i] : g->cmeth[i]; - return m; -} - -// Calculate combined methylation for a CpG using information from both strands -// if available, taking account of the called genotypes. 
If information is not -// available from both strands, use the single site estimate of methylation -void calc_cpg_meth(args_t *args, int ns, cpg_prob *cpg, gt_meth *g1, gt_meth *g2) { - double wval[3] = {1.0, 1.0, 0.5}; - double pval[3] = {1.0, 0.5, 1.0}; - for(int ix = 0; ix < ns; ix++) { - if(g1[ix].skip || g2[ix].skip) continue; - int gt1 = g1[ix].max_gt; - int gt2 = g2[ix].max_gt; - cpg[ix].max_gt[0] = gt1; - cpg[ix].max_gt[1] = gt2; - cpg[ix].prob_best = g1[ix].gt_prob[gt1] + g2[ix].gt_prob[gt2]; - cpg[ix].prob_cg = g1[ix].gt_prob[4] + g2[ix].gt_prob[7]; - // Calc meth - double n1[8], n2[8]; - qual_prob qp1[8], qp2[8]; - for (int i = 0; i < 8; i++) { - n1[i] = (double)g1[ix].counts[i]; - n2[i] = (double)g2[ix].counts[i]; - qp1[i] = q_prob[g1[ix].aqual[i]]; - qp2[i] = q_prob[g2[ix].aqual[i]]; - } - double l = 1.0 - args->under_conv; - double t = args->over_conv; - double g = (l - t) * 0.5; - double f = (2.0 - l - t) * 0.5; - double kc = qp1[5].k; - double kt = qp1[7].k; - double kg = qp2[6].k; - double ka = qp2[4].k; - int ix1 = gt_idx[gt1][0]; - int ix2 = gt_idx[gt2][1]; - if(ix1 >= 0) { - double w1 = wval[ix1]; - double p = pval[ix1]; - if(ix2 >= 0) { - double w2 = wval[ix2]; - double q = pval[ix2]; - // Get initial estimate - double m1 = (n1[5] + n2[6]) / (n1[5] + n2[6] + n1[7] * p + n2[4] * q); - double m = asin(2.0 * m1 - 1.0); - // Maximize using NR - for(int it = 0; it < 100; it++) { - double cosm = cos(m); - double sinm = sin(m); - double A = f + g * sinm; - double nm1 = g * p * w1 * cosm; - double d1 = p * w1 * A + kc; - double d2 = w1 * (1.0 - p * A) + kt; - double nm3 = g * q * w2 * cosm; - double d3 = q * w2 * A + kg; - double d4 = w2 * (1.0 - q * A) + ka; - double grad = nm1 * (n1[5] / d1 - n1[7] / d2) + nm3 * (n2[6] / d3 - n2[4] / d4); - if(fabs(grad) < 1.0e-8) { - m1 = 0.5 * (sinm + 1.0); - break; - } - double h = n1[5] * (nm1 * nm1 / d1 + g * p * w1 * sinm) / d1 + n1[7] * (nm1 * nm1 / d2 - g * p * w1 * sinm) / d2 + - n2[6] * (nm3 * nm3 / d3 + g 
* q * w2 * sinm) / d3 + n2[4] * (nm3 * nm3 / d4 - g * q * w2 * sinm) / d4; - m += grad / h; - } - cpg[ix].m = m1; - } else { - // Only the C+ has an estimate of methylation - double m1 = g1->cmeth[ix1]; - cpg[ix].m = m1; - } - } else if(ix2 >= 0) { - // Only the C- has an estimate of methylation - double m1 = g2->cmeth[ix2]; - cpg[ix].m = m1; - } else { - // No valud esetimates on either strand - cpg[ix].m = -1.0; - } - } -} - diff --git a/tools/gemBS_plugins/compress.c b/tools/gemBS_plugins/compress.c deleted file mode 100644 index 1c3e24fa..00000000 --- a/tools/gemBS_plugins/compress.c +++ /dev/null @@ -1,85 +0,0 @@ -/* -* compress.c -* -* Created on: 15 Sep 2016 -* Author: heath -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "compress.h" -#include "utils.h" - -static pthread_mutex_t compress_lock; -static struct compress compress_data; - -static void free_compress(void) { - int i, j; - - if (compress_data.initialized) { - pthread_mutex_lock(&compress_lock); - if (compress_data.initialized) { - for (i = 0; i < COMPRESS_NONE; i++) { - free(compress_data.compress_suffix[i]); - for (j = 0; j < 2; j++) - if (compress_data.comp_path[i][j]) - free(compress_data.comp_path[i][j]); - } - compress_data.initialized = false; - } - pthread_mutex_unlock(&compress_lock); - } -} - -static void init_compress(void) { - int i, j; - char *pnames[][2] = { - {"bgzip", NULL}, {"gzip", NULL}, {"bzip2", NULL}, {"xz", NULL}, {"compress", NULL}, {NULL, NULL}}; - int compress_type[] = {COMPRESS_GZIP, COMPRESS_GZIP, COMPRESS_BZIP2, COMPRESS_XZ, COMPRESS_COMPRESS, COMPRESS_NONE}; - char *suff[] = {"gz", "bz2", "xz", "Z"}; - char *path; - - if (!compress_data.initialized) { - pthread_mutex_lock(&compress_lock); - errno = 0; - if (!compress_data.initialized) { - (void)setlocale(LC_ALL, ""); - if (!(path = getenv("PATH"))) - path = DEFAULT_PATH; - for (i = 0; i < COMPRESS_NONE; i++) { - compress_data.compress_suffix[i] = 
strdup(suff[i]); - compress_data.comp_path[i][0] = compress_data.comp_path[i][1] = NULL; - } - int ix = 0; - while(pnames[ix][0] != NULL) { - i = compress_type[ix]; - if(compress_data.comp_path[i][0] == NULL) { - for (j = 0; j < 2; j++) - compress_data.comp_path[i][j] = pnames[ix][j] ? find_prog(pnames[ix][j], path) : NULL; - } - ix++; - } - for (i = 0; i < COMPRESS_NONE; i++) if (compress_data.comp_path[i][0] != NULL) break; - compress_data.default_compress = i; - if (atexit(free_compress)) - fprintf(stderr, "Warning: Unable to register exit function free_compress()\n"); - compress_data.initialized = true; - } - errno = 0; - pthread_mutex_unlock(&compress_lock); - } -} - -struct compress* get_compress_data(void) { - init_compress(); - return &compress_data; -} diff --git a/tools/gemBS_plugins/compress.h b/tools/gemBS_plugins/compress.h deleted file mode 100644 index 49656903..00000000 --- a/tools/gemBS_plugins/compress.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef COMPRESS_H_ -#define COMPRESS_H_ - -#define COMPRESS_GZIP 0 -#define COMPRESS_BZIP2 1 -#define COMPRESS_XZ 2 -#define COMPRESS_COMPRESS 3 -#define COMPRESS_NONE 4 - -struct compress { - char *comp_path[COMPRESS_NONE][2]; - char *compress_suffix[COMPRESS_NONE]; - int default_compress; - bool initialized; -}; - -struct compress* get_compress_data(void); - -#endif /* COMPRESS_H */ diff --git a/tools/gemBS_plugins/mextr.c b/tools/gemBS_plugins/mextr.c deleted file mode 100644 index c4ce9516..00000000 --- a/tools/gemBS_plugins/mextr.c +++ /dev/null @@ -1,591 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mextr.h" -#include "utils.h" -#include "compress.h" - -static FILE *open_ofile(char *name, int compress, bool append) { - FILE *fp = NULL; - if(append) compress = 0; - int comp_ix = COMPRESS_NONE; - if(compress != 0) { - for(comp_ix = 0; comp_ix < COMPRESS_NONE; comp_ix++) { - if(compress & (1 << comp_ix)) break; - } - } - struct compress 
*cdata = comp_ix < COMPRESS_NONE ? get_compress_data() : NULL; - if(name != NULL) { - if(comp_ix < COMPRESS_NONE) { - char *tname = name; - char *suffix = cdata->compress_suffix[comp_ix]; - // Check whether file name already has suffix - char *p = strrchr(tname, '.'); - if(p == NULL || strcmp(p + 1, suffix)) { - // No, so we will have to add it - tname = malloc(strlen(name) + strlen(suffix) + 2); - sprintf(tname, "%s.%s", name, suffix); - } - int i = child_open(WRITE, tname, cdata->comp_path[comp_ix][0]); - fp = fdopen(i, "w"); - if(name != tname) free(tname); - } else { - fp = append ? fopen(name, "a") : fopen(name, "w"); - } - } else { - if(isatty(fileno(stdout))) comp_ix = COMPRESS_NONE; - if(comp_ix < COMPRESS_NONE) { - int i = child_open(WRITE, NULL, cdata->comp_path[comp_ix][0]); - fp = fdopen(i, "w"); - } else fp = stdout; - } - return fp; -} - -static void init_stats(args_t *a) { - a->stats = calloc((size_t)1, sizeof(stats_t)); -} - -static void write_stats(args_t *a) { - if(a->stats != NULL) { - a->reportfile = a->reportfilename == NULL ? NULL : fopen(a->reportfilename, "w"); - if(a->reportfile != NULL) { - FILE *fp = a->reportfile; - stats_t *st = a->stats; - fprintf(fp,"{\n\t\"TotalSites\": %" PRIu64 ",\n", st->n_sites); - fprintf(fp,"\t\"SitesPassed\": %" PRIu64 "\n", st->n_sites_pass); - fputs("}\n", fp); - fclose(fp); - } - } -} - -static gt_meth *sample_gt[2]; -static cpg_prob *sample_cpg; -static double *sample_Q[3]; - -static void init_files(args_t *a) { - if(a->cpgfilename == NULL) a->cpgfile = NULL; - else if(a->cpgfilename[0] == '-' && a->cpgfilename[1] == 0) a->cpgfile = open_ofile(NULL, 0, false); - else a->cpgfile = open_ofile(a->cpgfilename, a->compress, a->append_mode); - if(a->wigfilename == NULL) a->wigfile = NULL; - else if(a->wigfilename[0] == '-' && a->wigfilename[1] == 0) a->wigfile = open_ofile(NULL, 0, false); - else a->wigfile = open_ofile(a->wigfilename, a->compress, a->append_mode); - - a->noncpgfile = a->noncpgfilename == NULL ? 
(a->output_noncpg ? a->cpgfile : NULL) : open_ofile(a->noncpgfilename, a->compress, a->append_mode); - if(a->bedmethyl != NULL) { - char *p = strrchr(a->bedmethyl, '.'); - if(p && !strcmp(".bed",p)) { - *p = 0; - } - p = strrchr(a->bedmethyl, '_'); - if(p && !strcmp("_cpg",p)) { - *p = 0; - } - size_t l = strlen(a->bedmethyl); - p = malloc((l + 9) * 3); - a->bedmethylnames[0] = p; - a->bedmethylnames[1] = p + l + 9; - a->bedmethylnames[2] = p + 2 * (l + 9); - sprintf(a->bedmethylnames[BEDMETHYL_CPG], "%s_cpg.bed", a->bedmethyl); - sprintf(a->bedmethylnames[BEDMETHYL_CHG], "%s_chg.bed", a->bedmethyl); - sprintf(a->bedmethylnames[BEDMETHYL_CHH], "%s_chh.bed", a->bedmethyl); - for(int i = 0; i < 3; i++) - a->bedmethylfiles[i] = open_ofile(a->bedmethylnames[i], a->compress, a->append_mode); - } -} - -static void print_file_header(FILE *fp, int ns, char **names) { - if(fp != NULL) { - fputs("Contig\tPos0\tPos1\tRef", fp); - for(int i = 0; i < ns; i++) { - char *name = names[i]; - fprintf(fp, "\t%s:Call\t%s:Flags\t%s:Meth\t%s:non_conv\t%s:conv\t%s:support_call\t%s:total", name, name, name, name, name, name, name); - } - fputc('\n', fp); - } -} - -char *copy_and_strip_quotes(char *s) { - if(!s) return s; - size_t l = strlen(s); - if(l > 1) { - if((s[0] == '\"' && s[l-1] =='\"') || (s[0] == '\'' && s[l-1] =='\'')) { - (s++)[--l] = 0; - } - } - char *s1 = malloc(l + 1); - if(s1 != NULL) memcpy(s1, s, l + 1); - return s1; -} - -static void print_bedmethyl_headers(args_t *args) { - if(args->bedmethyl_track_line == NULL) { - char *sample_name = NULL; - char *sample_desc = NULL; - char *sample_bc = NULL; - // Try and get sample info from VCF file headers - bcf_hdr_t *h = args->hdr; - for(int i = 0; i < h->nhrec; i++) { - bcf_hrec_t *hr = h->hrec[i]; - if(hr->type == BCF_HL_STR) { - if(!strcmp(hr->key, "bs_call_sample_info")) { - int ix = bcf_hrec_find_key(hr, "ID"); - if(ix >= 0) { - sample_bc = copy_and_strip_quotes(hr->vals[ix]); - ix = bcf_hrec_find_key(hr, "SM"); - if(ix >= 
0) sample_name = copy_and_strip_quotes(hr->vals[ix]); - ix = bcf_hrec_find_key(hr, "DS"); - if(ix >= 0) sample_desc = copy_and_strip_quotes(hr->vals[ix]); - } - } - } - } - if(sample_name == NULL) sample_name = strdup(h->samples[0]); - if(sample_desc == NULL) sample_desc = strdup(sample_name); - for(bedmethyl_type t = BEDMETHYL_CPG; t <= BEDMETHYL_CHH; t++) { - FILE *fp = args->bedmethylfiles[t]; - if(fp != NULL) { - fprintf(fp, "track name=\"%s\" description=\"%s\" visibility=2 itemRgb=\"On\"\n", sample_desc, sample_name); - } - } - if(sample_bc) free(sample_bc); - if(sample_name) free(sample_name); - args->bedmethyl_desc = sample_desc; - } else { - for(bedmethyl_type t = BEDMETHYL_CPG; t <= BEDMETHYL_CHH; t++) { - char *line = args->bedmethyl_track_line; - size_t l = strlen(line); - if(l > 1 && line[l - 1] == '\n') line[--l] = 0; - if(!strncmp(line, "track ", 6)) line += 6; - FILE *fp = args->bedmethylfiles[t]; - if(fp != NULL) fprintf(fp, "track %s\n", line); - } - } -} - -static void print_headers(args_t *args) { - int ns = bcf_hdr_nsamples(args->hdr); - if(args->cpgfile) print_file_header(args->cpgfile, ns, args->hdr->samples); - if(args->output_noncpg && args->noncpgfile != args->cpgfile) - print_file_header(args->noncpgfile, ns, args->hdr->samples); - print_bedmethyl_headers(args); -} - -static void close_files(args_t *a) { - if(a->cpgfile != NULL && a->cpgfile != stdout) fclose(a->cpgfile); - if(a->noncpgfile != NULL) fclose(a->noncpgfile); - if(a->wigfile != NULL && a->wigfile != stdout) fclose(a->wigfile); - for(int i = 0; i < 3; i++) - if(a->bedmethylfiles[i] != NULL) fclose(a->bedmethylfiles[i]); - while(waitpid(-1, NULL, 0) > 0); -} - -static args_t args = { - .hdr = NULL, - .cpgfile = NULL, - .noncpgfile = NULL, - .wigfile = NULL, - .reportfile = NULL, - .cpgfilename = NULL, - .wigfilename = NULL, - .bedmethylfiles = {NULL, NULL, NULL}, - .noncpgfilename = NULL, - .reportfilename = NULL, - .bedmethyl = NULL, - .bedmethylnames = {NULL, NULL, NULL}, - 
.bedmethyl_track_line = NULL, - .bedmethyl_desc = ".", - .stats = NULL, - .min_prop = 0.0, - .min_num = 1, - .min_inform = 0, - .min_nc = 1, - .ref_bias = DEFAULT_REF_BIAS, - .under_conv = DEFAULT_UNDER_CONV, - .over_conv = DEFAULT_OVER_CONV, - .bq_thresh = DEFAULT_BQ_THRESH, - .mq_thresh = DEFAULT_MAPQ_THRESH, - .mode = CPGMODE_COMBINED, - .sel_mode = SELECT_HOM, - .sel_thresh = DEFAULT_SELECT_THRESH, - .compress = 0, - .common_gt = false, - .output_noncpg = false, - .header = true, - .append_mode = false -}; - -const char *about(void) -{ - return "Extract CpG and nonCpG sites.\n"; -} - -const char *usage(void) -{ - return - "\n" - "About: Extract CpG and nonCpG sites.\n" - "Usage: bcftools +mextr [General Options] -- [Plugin Options]\n" - "Options:\n" - " run \"bcftools plugin\" for a list of common options\n" - "\n" - "Plugin options:\n" - " -o, --cpgfile Output file for CpG sites (default = stdout)\n" - " -n, --noncpgfile Output file for nonCpG sites (default, not output)\n" - " -b. --bed-methyl Output file base for bedMethly files. Not compatible with multi-sample files (default, not output)\n" - " -w. --wigfile Output file for wig file (methylation)\n" - " -t. 
--bed-track-line Track line for for bedMethly files (default, info taken from input VCF file)\n" - " -r, --report-file Output file for JSON report (default, not output)\n" - " -H, --no_header Do not print header line(s) in output file(s) (default, false)\n" - " -g, --common-gt Recall genotypes assuming common genotypes across samples\n" - " -m, --mode Output mode for CpG sites\n" - " combined Generate one line per CpG with combined estimates (default)\n" - " strand-specific Generate two lines per CpG with the strand specific estimates\n" - " -s, --select Select mode for sites/CpGs\n" - " hom Select on homozygote sites/CpGs (default)\n" - " het Select on heterozygote sites/CpGs\n" - " -R, --reference-bias Reference bias for re-calling (default 2)\n" - " -M, --min-nc Minimum number of non-converted bases for non CpG site (default 1)\n" - " -p, --prop Minimum proportion of sites/CpGs that must pass (default 0.0)\n" - " -N, --number Minimum number of sites/CpGs that must pass (default 1)\n" - " -I, --inform Minimum number of informative reads for a CpG/site to pass (default 1)\n" - " -T, --threshold Phred scaled threshold probability of selecting sites/CpGs (default 20)\n" - " -c, --conversion , set under and over conversion rates\n" - " -Q, --bq-threshold Base qality threshold used for calling\n" - " -z, --gzip Compress output with gzip (bgzip if available)\n" - " -j, --bzip2 Compress output with bzip2\n" - " -x, --xz Compress output with xz\n" - " -a, --append Append to output files rather than create new ones. 
Not compatible wih output compression\n" - "\n" - "Example:\n" - " bcftools +mextr in.vcf -- -o out_cpg.txt -n out_noncpg.txt -z\n" - "\n"; -} - -// Try to parse the paramaters used for bs_call from the headers -static void check_hdr_params(args_t *a) { - char *par[] = {"under_conversion", "over_conversion", "mapq_thresh", "bq_thresh", NULL}; - bcf_hdr_t *h = a->hdr; - for(int i = 0; i < h->nhrec; i++) { - bcf_hrec_t *hr = h->hrec[i]; - if(hr->type == BCF_HL_GEN) { - if(!strcmp(hr->key, "source") && !strncmp(hr->value, "bs_call", 7)) { - char *p = strchr(hr->value, ','); - while(p != NULL) { - p++; - int ix; - for(ix = 0; par[ix] != NULL; ix++) if(!strncmp(p, par[ix], strlen(par[ix]))) break; - if(par[ix] != NULL) { - char *p1 = strchr(p, '='); - if(p1) { - switch(ix) { - case 0: - a->under_conv = strtod(p1 + 1, &p); - break; - case 1: - a->over_conv = strtod(p1 + 1, &p); - break; - case 2: - a->mq_thresh = (int)strtol(p1 + 1, &p, 10); - break; - case 3: - a->bq_thresh = (int)strtol(p1 + 1, &p, 10); - break; - } - } - } - p = strchr(p, ','); - } - } - } - } -} - -int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out __unused__) -{ - args.hdr = in; - check_hdr_params(&args); - static struct option loptions[] = { - {"cpgfile",required_argument,0,'o'}, - {"wigfile",required_argument,0,'w'}, - {"noncpgfile",required_argument,0,'n'}, - {"bed-methyl",required_argument,0,'b'}, - {"bed-track-line",required_argument,0,'t'}, - {"report-file",required_argument,0,'r'}, - {"no_header",no_argument,0,'H'}, - {"common-gt",no_argument,0,'g'}, - {"mode",required_argument,0,'m'}, - {"select",required_argument,0,'s'}, - {"prop",required_argument,0,'p'}, - {"min-nc",required_argument,0,'M'}, - {"reference-bias",required_argument,0,'R'}, - {"number",required_argument,0,'N'}, - {"inform",required_argument,0,'I'}, - {"threshold",required_argument,0,'T'}, - {"conversion",required_argument,0,'c'}, - {"bq-conversion",required_argument,0,'Q'}, - {"gzip",no_argument,0,'z'}, - 
{"bzip2",no_argument,0,'j'}, - {"xz",no_argument,0,'x'}, - {"append",no_argument,0,'a'}, - {0,0,0,0} - }; - int c; - bool mult_comp = false; - while ((c = getopt_long(argc, argv, "?Qh:o:c:b:n:r:m:R:M:I:s:p:N:T:t:w:gzHjxa",loptions,NULL)) >= 0) { - switch (c) { - case 'a': - args.append_mode = true; - break; - case 'o': - args.cpgfilename = optarg; - break; - case 'w': - args.wigfilename = optarg; - break; - case 'n': - args.noncpgfilename = optarg; - args.output_noncpg = true; - break; - case 'r': - args.reportfilename = optarg; - break; - case 'R': - args.ref_bias = atof(optarg); - break; - case 'H': - args.header = false; - break; - case 'g': - args.common_gt = true; - break; - case 's': - if(!strcasecmp(optarg, "hom")) args.sel_mode = SELECT_HOM; - else if(!strcasecmp(optarg, "het")) args.sel_mode = SELECT_HET; - else error ("s (select) option can be either 'hom' or 'het'\n"); - break; - case 'm': - if(!strcasecmp(optarg, "combined")) args.mode = CPGMODE_COMBINED; - else if(!strcasecmp(optarg, "strand-specific")) args.mode = CPGMODE_SEPARATE; - else error ("m (mode) option can be either 'combined' or 'strand-specific'\n"); - break; - case 'c': - if (sscanf(optarg, "%lf,%lf", &args.under_conv, &args.over_conv) != 2) - error("c (conversion) option expects two comma separated arguments)\n"); - break; - case 'p': - args.min_prop = atof(optarg); - if(args.min_prop < 0.0) args.min_prop = 0.0; - else if(args.min_prop > 1.0) args.min_prop = 1.0; - break; - case 'T': - args.sel_thresh = atoi(optarg); - if(args.sel_thresh < 0) args.sel_thresh = 0; - else if(args.sel_thresh > 255) args.sel_thresh = 255; - break; - case 'N': - args.min_num = atoi(optarg); - if(args.min_num < 1) args.min_num = 1; - break; - case 'b': - args.bedmethyl = optarg; - break; - case 't': - args.bedmethyl_track_line = optarg; - break; - case 'I': - args.min_inform = atoi(optarg); - if(args.min_inform < 0) args.min_inform = 0; - break; - case 'M': - args.min_nc = atoi(optarg); - if(args.min_nc < 0) 
args.min_nc = 0; - break; - case 'Q': - args.bq_thresh = atoi(optarg); - break; - case 'z': - if(args.compress) mult_comp = true; - else args.compress |= COMP_GZIP; - break; - case 'j': - if(args.compress) mult_comp = true; - else args.compress |= COMP_BZIP2; - break; - case 'x': - if(args.compress) mult_comp = true; - else args.compress |= COMP_XZ; - break; - case 'h': - case '?': - default: error(usage()); break; - } - } - if(mult_comp) error("Can not combine multiple compression options\n"); - if(args.append_mode && args.compress) error("Output compression not compatible with append mode\n"); - int ns = bcf_hdr_nsamples(args.hdr); - assert(ns > 0); - if((args.bedmethyl || args.wigfile) && ns > 1) error("bedMethyl and wig output not compatible with multi-sample files\n"); - if (optind != argc) error(usage()); - init_files(&args); - if(!args.append_mode && (args.header || args.bedmethyl)) print_headers(&args); - if(args.reportfilename != NULL) init_stats(&args); - sample_gt[0] = malloc(sizeof(gt_meth) * ns * 2); - sample_gt[1] = sample_gt[0] + ns; - sample_Q[0] = malloc(sizeof(double) * (ns + 1) * 2 + ns); - sample_Q[1] = sample_Q[0] + ns + 1; - sample_Q[2] = sample_Q[1] + ns + 1; - if(args.mode == CPGMODE_COMBINED) sample_cpg = malloc(sizeof(cpg_prob) * ns); - fill_base_prob_table(); - return 1; -} - -static fmt_field_t tags[] = { - { "FT", BCF_HT_STR, {{NULL, 0, 0}, {NULL, 0, 0}}}, - { "MC8", BCF_HT_INT, {{NULL, 0, 0}, {NULL, 0, 0}}}, - { "AMQ", BCF_HT_INT, {{NULL, 0, 0}, {NULL, 0, 0}}}, - { "CX", BCF_HT_STR, {{NULL, 0, 0}, {NULL, 0, 0}}}, - { "AQ", BCF_HT_INT, {{NULL, 0, 0}, {NULL, 0, 0}}}, - { "MQ", BCF_HT_INT, {{NULL, 0, 0}, {NULL, 0, 0}}}, - { NULL, 0, {{NULL, 0, 0}, {NULL, 0, 0}}}, -}; - -bcf1_t *process(bcf1_t *rec) -{ - static int idx; - static int32_t curr_rid = -1, prev_pos = -1; - static bcf1_t prev_rec; - - int ns = bcf_hdr_nsamples(args.hdr); - stats_t *st = args.stats; - if(st != NULL) st->n_sites++; - bcf_unpack(rec, BCF_UN_FLT); - int n_all = 
rec->n_allele; - bool cg = false; - for(int i = 0; i < n_all; i++) { - char c = rec->d.allele[i][0]; - if((c == 'C' || c == 'G') && rec->d.allele[i][1] == 0) { - cg = true; - break; - } - } - if(cg) { // Site with potentially Cs or Gs - bcf_unpack(rec, BCF_UN_ALL); - // Get format tags - for(int ix = 0; tags[ix].tag != NULL; ix++) { - fmt_store_t *s = tags[ix].st + idx; - s->ne = bcf_get_format_values(args.hdr, rec, tags[ix].tag, &s->dat_p, &s->dat_n, tags[ix].type); - } - if(tags[FMT_CX].st[idx].ne > 0 && tags[FMT_MC8].st[idx].ne == ns * 8) { - // Get sample base counts and genotype probs. - int32_t *mc8_p = tags[FMT_MC8].st[idx].dat_p; - int32_t *amq_p = tags[FMT_AMQ].st[idx].dat_p; - int n_amq = tags[FMT_AMQ].st[idx].ne / ns; - int32_t *aq_p = tags[FMT_AQ].st[idx].ne == ns ? tags[FMT_AQ].st[idx].dat_p : NULL; - int32_t *mq_p = tags[FMT_MQ].st[idx].ne == ns ? tags[FMT_MQ].st[idx].dat_p : NULL; - double ms_mq = 0.0; - int32_t tot_n = 0; - for(int i = 0; i < ns; i++) { - int32_t *ct = sample_gt[idx][i].counts; - int32_t *amq = sample_gt[idx][i].aqual; - memset(ct, 0, sizeof(int32_t) * 8); - memset(amq, 0, sizeof(int32_t) * 8); - int32_t x = mc8_p[i * 8]; - int k = 0; - if(x != bcf_int32_missing) { - int k1 = 0; - for(int j = 0; j < 8; j++) { - x = mc8_p[i * 8 + j]; - ct[j] += x; - k += x; - if(x > 0 && amq_p != NULL && k1 < n_amq) { - int q = amq_p[i * n_amq + k1++]; - if(q >= 0) { - if(q > MAX_QUAL) q = MAX_QUAL; - amq[j] = q; - } - } - } - if(amq_p == NULL) { - int q = aq_p == NULL ? args.bq_thresh : aq_p[i]; - if(q > MAX_QUAL) q = MAX_QUAL; - for(int j = 0; j < 8; j++) amq[j] = q; - } - } - if(k > 0) { - if(mq_p != NULL) { - int m = mq_p[i]; - ms_mq += (double)k * (double)(m * m); - } - tot_n += k; - calc_gt_prob(sample_gt[idx] + i, &args, rec->d.allele[0][0]); - sample_gt[idx][i].skip = false; - } else sample_gt[idx][i].skip = true; - } - // If we force a common genotype, calculate prob. 
distribution for common genotype - if(args.common_gt) { - double gt[10]; - for(int k = 0; k < 10; k++) gt[k] = 0.0; - for(int i = 0; i < ns; i++) { - if(!sample_gt[idx][i].skip) { - for(int k = 0; k < 10; k++) gt[k] += sample_gt[idx][i].gt_prob[k]; - } - } - double max = gt[0]; - int max_gt = 0; - for(int k = 1; k < 10; k++) { - if(gt[k] > max) { - max = gt[k]; - max_gt = k; - } - } - double sum = 0.0; - for(int k = 0; k < 10; k++) sum += exp(gt[k] - max); - sum = log(sum); - for(int k = 0; k < 10; k++) gt[k] -= (max + sum); - for(int i = 0; i < ns; i++) { - if(!sample_gt[idx][i].skip) { - for(int k = 0; k < 10; k++) sample_gt[idx][i].gt_prob[k] = gt[k]; - sample_gt[idx][i].max_gt = max_gt; - sample_gt[idx][i].sum = max + sum; - } - } - } - // Here is the logic for deciding what we print - - // check if we are next to the previous record - bool consec = false; - if(rec->rid != curr_rid) curr_rid = rec->rid; - else if(rec->pos - prev_pos == 1) consec = true; - if(consec) { - output_cpg(&args, &prev_rec, tags, sample_gt, idx ^ 1, sample_cpg, sample_Q); - } else if(args.output_noncpg && prev_pos >= 0) { - output_nonconsec_noncpg(&args, &prev_rec, tags, sample_gt, idx ^ 1, true, sample_cpg, sample_Q); - output_nonconsec_noncpg(&args, rec, tags, sample_gt, idx, false, sample_cpg, sample_Q); - } - if(args.bedmethyl || args.wigfile) { - output_bedmethyl(&args, rec, tags, sample_gt, idx); - } - idx ^= 1; - prev_pos = rec->pos; - memcpy(&prev_rec, rec, sizeof(bcf1_t)); - if(st != NULL) st->n_sites_pass++; - } - } - return NULL; -} - -void destroy(void) -{ - write_stats(&args); - close_files(&args); -} diff --git a/tools/gemBS_plugins/mextr.h b/tools/gemBS_plugins/mextr.h deleted file mode 100644 index 4b8fd9a4..00000000 --- a/tools/gemBS_plugins/mextr.h +++ /dev/null @@ -1,109 +0,0 @@ -#ifndef MEXTR_H_ -#define MEXTR_H_ - -#include -#include -#include -#include -#include -#include - -#define COMP_GZIP (1 << COMPRESS_GZIP) -#define COMP_BZIP2 (1 << COMPRESS_BZIP2) -#define 
COMP_XZ (1 << COMPRESS_XZ) - -#define LOG10 2.30258509299404568402 - -#define DEFAULT_UNDER_CONV 0.01 -#define DEFAULT_OVER_CONV 0.05 -#define DEFAULT_MAPQ_THRESH 20 -#define DEFAULT_BQ_THRESH 20 -#define DEFAULT_REF_BIAS 2 -#define MAX_QUAL 43 -#define DEFAULT_SELECT_THRESH 20 - -void error(const char *format, ...) HTS_NORETURN; - -typedef struct { - uint64_t n_sites; - uint64_t n_sites_pass; -} stats_t; - -typedef enum {FMT_FT, FMT_MC8, FMT_AMQ, FMT_CX, FMT_AQ, FMT_MQ, FMT_GQ, FMT_GOF, FMT_GL} fmt_tag; -typedef enum {CPGMODE_COMBINED, CPGMODE_SEPARATE} cpg_mode; -typedef enum {SELECT_HOM, SELECT_HET} select_mode; -typedef enum {BEDMETHYL_CPG, BEDMETHYL_CHG, BEDMETHYL_CHH, BEDMETHYL_NONE} bedmethyl_type; - -typedef struct { - bcf_hdr_t *hdr; - char *cpgfilename; - char *noncpgfilename; - char *reportfilename; - char *wigfilename; - char *bedmethyl; - char *bedmethylnames[3]; - char *bedmethyl_track_line; - char *bedmethyl_desc; - FILE *cpgfile; - FILE *noncpgfile; - FILE *wigfile; - FILE *reportfile; - FILE *bedmethylfiles[3]; - stats_t *stats; - cpg_mode mode; - select_mode sel_mode; - int sel_thresh; - int compress; - bool common_gt; - bool output_noncpg; - bool header; - double min_prop; - int min_num; - int min_inform; - int min_nc; - double ref_bias; - double under_conv; - double over_conv; - int bq_thresh; - int mq_thresh; - bool append_mode; -} args_t; - -typedef struct { - void *dat_p; - int dat_n; - int ne; -} fmt_store_t; - -typedef struct { - char *tag; - int type; - fmt_store_t st[2]; -} fmt_field_t; - -typedef struct { - int32_t counts[8]; - int32_t aqual[8]; // Average base quality - double gt_prob[10]; // Genotype log probabilities (Log10) - double cmeth[3], gmeth[3]; - double sum; - uint8_t max_gt; - bool skip; -} gt_meth; - -typedef struct { - double prob_best; - double prob_cg; - uint8_t max_gt[2]; - double m; -} cpg_prob; - -void calc_gt_prob(gt_meth *gt, args_t *args, char rf); -void calc_cpg_meth(args_t *args, int ns, cpg_prob *cpg, gt_meth 
*g1, gt_meth *g2); -double get_meth(gt_meth *g, int idx); -void output_cpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx, cpg_prob *sample_cpg, double *Q[]); -void output_nonconsec_noncpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx, bool first, cpg_prob *sample_cpg, double *Q[]); -void output_bedmethyl(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx); -void fill_base_prob_table(void); - -#endif // MEXTR_H_ diff --git a/tools/gemBS_plugins/mextr.mk b/tools/gemBS_plugins/mextr.mk deleted file mode 100644 index e3a064eb..00000000 --- a/tools/gemBS_plugins/mextr.mk +++ /dev/null @@ -1,2 +0,0 @@ -plugins/mextr.so: plugins/mextr.c version.h version.c ../gemBS_plugins/utils.c ../gemBS_plugins/output.c ../gemBS_plugins/compress.c ../gemBS_plugins/calc_gt_prob.c ../gemBS_plugins/compress.h ../gemBS_plugins/utils.h ../gemBS_plugins/mextr.h - $(CC) --std=gnu11 $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) -I ../gemBS_plugins $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ ../gemBS_plugins/utils.c ../gemBS_plugins/output.c ../gemBS_plugins/compress.c ../gemBS_plugins/calc_gt_prob.c version.c $< $(LIBS) diff --git a/tools/gemBS_plugins/output.c b/tools/gemBS_plugins/output.c deleted file mode 100644 index e8548d5a..00000000 --- a/tools/gemBS_plugins/output.c +++ /dev/null @@ -1,407 +0,0 @@ -#include -#include -#include -#include - -#include "mextr.h" - -int calc_phred(double z) { - int phred; - if(z <= 0.0) phred = 255; - else { - phred = (int)(-10.0 * log(z) / LOG10); - if(phred > 255) phred = 255; - } - return phred; -} - -static double *get_prob_dist(int ns, double *Q[]) { - // Build up prob. 
distribution Q(i) where Q(i) = prob that i samples have genotype CG/CG - double *p = Q[2]; - double *q0 = Q[0]; - double *q1 = Q[1]; - q0[0] = 1.0; - for(int ix = 0; ix < ns; ix++) { - double z = p[ix]; - q1[0] = q0[0] * (1.0 - z); - for(int k = 1; k <= ix; k++) q1[k] = q0[k - 1] * z + q0[k] * (1.0 - z); - q1[ix + 1] = q0[ix] * z; - double *t = q0; - q0 = q1; - q1 = t; - } - return q0; -} - -static char trans_base[256] = { - ['A'] = 'T', ['C'] = 'G', ['G'] = 'C', ['T'] = 'A', - ['Y'] = 'R', ['R'] = 'Y', ['S'] = 'S', ['W'] = 'W', ['K'] = 'M', ['M'] = 'K', - ['B'] = 'V', ['V'] = 'B', ['D'] = 'H', ['H'] = 'D', ['N'] = 'N', ['.'] = '.' -}; - -void output_cpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx, cpg_prob *cpg, double *Q[]) { - static char *cx; - static int32_t cx_n; - static char *gt_iupac = "AMRWCSYGKT"; - static uint8_t gt_msk[] = {0x11, 0xb3, 0x55, 0x99, 0xa2, 0xf6, 0xaa, 0x54, 0xdc, 0x88}; - - FILE *fp = args->cpgfile; - int ns = bcf_hdr_nsamples(args->hdr); - int min_n = args->min_num; - int n1 = (int)(args->min_prop * (double)ns + 0.5); - if(n1 > min_n) min_n = n1; - if(fp != NULL) { - // Build up prob. 
distribution Q(i) where Q(i) = prob that i samples have genotype CG/CG - bool skip = true; - for(int ix = 0; ix < ns; ix++) { - gt_meth *g1 = sample_gt[idx]+ix, *g2 =sample_gt[idx ^ 1]+ix; - double z = 0.0; - if(!(g1->skip || g2->skip)) { - if((g1->counts[5] + g1->counts[7] >= args->min_inform) || (g2->counts[6] + g1->counts[4] >= args->min_inform)) { - if(args->sel_mode == SELECT_HOM) { - z = exp(g1->gt_prob[4] + g2->gt_prob[7]); - if(g1->max_gt == 4 && g2->max_gt == 7) skip = false; - } else { - z = (exp(g1->gt_prob[1]) + exp(g1->gt_prob[4]) + exp(g1->gt_prob[5]) + exp(g1->gt_prob[6])) * - (exp(g2->gt_prob[2]) + exp(g2->gt_prob[5]) + exp(g2->gt_prob[7]) + exp(g2->gt_prob[8])); - if((g1->max_gt == 1 || (g1->max_gt >= 4 && g1->max_gt <= 6)) && - (g2->max_gt == 2 || g2->max_gt == 5 || g2->max_gt == 7 || g2->max_gt == 8)) skip = false; - } - } - } - Q[2][ix] = z; - } - double *p = get_prob_dist(ns, Q); - double z = p[0]; - for(int i = 1; i <= ns && i < min_n; i++) z += p[i]; - int phred = calc_phred(z); - if(!skip && phred >= args->sel_thresh) { - int cx_len = bcf_get_info_values(args->hdr, rec, "CX", (void **)&cx, &cx_n, BCF_HT_STR); - int cx_sz = tags[FMT_CX].st[idx].ne / ns; - if(args->mode == CPGMODE_COMBINED) { - calc_cpg_meth(args, ns, cpg, sample_gt[idx], sample_gt[idx ^ 1]); - fprintf(fp,"%s\t%" PRId64 "\t%" PRId64 "\t%.2s", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 2, cx_len >= 5 ? cx + 2 : "."); - char *cx_p = tags[FMT_CX].st[idx].dat_p; - int *mq_p1 = tags[FMT_MQ].st[idx].ne == ns ? tags[FMT_MQ].st[idx].dat_p : NULL; - int *mq_p2 = tags[FMT_MQ].st[idx^1].ne == ns ? tags[FMT_MQ].st[idx^1].dat_p : NULL; - for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { - gt_meth *g1 = sample_gt[idx]+ix, *g2 =sample_gt[idx ^ 1]+ix; - if(!(g1->skip || g2->skip)) { - int gq = calc_phred(1.0 - exp(g1->gt_prob[g1->max_gt] + g2->gt_prob[g2->max_gt])); // Prob. 
of not being called genotype - fprintf(fp, "\t%c%c\tGQ=%d", gt_iupac[g1->max_gt], gt_iupac[g2->max_gt], gq); - if(g1->max_gt != 4 || g2->max_gt != 7) { - int dq = calc_phred(exp(g1->gt_prob[4] + g2->gt_prob[7])); // Prob. of being CG - fprintf(fp, ";DQ=%d", dq); - } - int mq = -1; - if(mq_p1 != NULL) { - if(mq_p2 != NULL) { - double n1 = 0.0, n2 = 0.0; - for(int k = 0; k < 8; k++) { - n1 += (double)g1->counts[k]; - n2 += (double)g2->counts[k]; - } - if(n1 + n2 > 0.0) { - double mq1 = (double)mq_p1[ix]; - double mq2 = (double)mq_p2[ix]; - mq = (int32_t)(0.5 + sqrt((mq1 * mq1 * n1 + mq2 * mq2 + n2) / (n1 + n2))); - } - } else mq = mq_p1[ix]; - } else if(mq_p2 != NULL) mq = mq_p2[ix]; - if(mq >= 0) fprintf(fp, ";MQ=%d", mq); - int32_t ct[4]; - ct[0] = g1->counts[5] + g2->counts[6]; - ct[1] = g1->counts[7] + g2->counts[4]; - ct[2] = ct[3] = 0; - uint8_t m = 1; - uint8_t msk1 = gt_msk[g1->max_gt]; - uint8_t msk2 = gt_msk[g2->max_gt]; - for(int i = 0; i < 8; i++, m <<= 1) { - ct[3] += g1->counts[i] + g2->counts[i]; - if(msk1 & m) ct[2] += g1->counts[i]; - if(msk2 & m) ct[2] += g2->counts[i]; - } - fprintf(fp, "\t%.3f\t%d\t%d\t%d\t%d", cpg[ix].m, ct[0], ct[1], ct[2], ct[3]); - } else { - fputs("\t.\t.\t.\t.\t.\t.\t.", fp); - } - } - fputc('\n', fp); - } else { - for(int pos = 0; pos < 2; pos++) { - int *mq_p = tags[FMT_MQ].st[idx ^ pos].ne == ns ? tags[FMT_MQ].st[idx ^ pos].dat_p : NULL; - fprintf(fp,"%s\t%" PRId64 "\t%" PRId64 " \t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, cx_len >= 3 + pos ? cx[2 + pos] : '.'); - char *cx_p = tags[FMT_CX].st[idx].dat_p; - for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { - gt_meth *g = sample_gt[idx ^ pos]+ix; - if(!g->skip) { - int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. of not being called genotype - fprintf(fp, "\t%c\tGQ=%d", gt_iupac[g->max_gt], gq); - if(g->max_gt != (pos ? 7 : 4)) { - int dq = calc_phred(exp(g->gt_prob[pos ? 7 : 4])); // Prob. 
of being CG - fprintf(fp, ";DQ=%d", dq); - } - int mq = -1; - if(mq_p != NULL) mq = mq_p[ix]; - if(mq >= 0) fprintf(fp, ";MQ=%d", mq); - int32_t ct[4]; - if(pos) { - ct[0] = g->counts[6]; - ct[1] = g->counts[4]; - } else { - ct[0] = g->counts[5]; - ct[1] = g->counts[7]; - } - ct[2] = ct[3] = 0; - uint8_t m = 1; - uint8_t msk = gt_msk[g->max_gt]; - for(int i = 0; i < 8; i++, m <<= 1) { - ct[3] += g->counts[i]; - if(msk & m) ct[2] += g->counts[i]; - } - double meth = get_meth(g, pos); - fprintf(fp, "\t%.3f\t%d\t%d\t%d\t%d", meth, ct[0], ct[1], ct[2], ct[3]); - } else { - fputs("\t.\t.\t.\t.\t.\t.\t.", fp); - } - } - fputc('\n', fp); - } - } - } - } - if(args->output_noncpg) { - fp=args->noncpgfile; - assert(fp != NULL); - for(int pos = 0; pos < 2; pos++) { - for(int ix = 0; ix < ns; ix++) { - double z = 0.0; - gt_meth *g = sample_gt[idx ^ pos] + ix; - if(!g->skip) { - if(!pos) { - if(g->counts[5] >= args->min_nc && (g->counts[5] + g->counts[7] >= args->min_inform)) { - if(args->sel_mode == SELECT_HOM) z = exp(g->gt_prob[4]); - else z = exp(g->gt_prob[1]) + exp(g->gt_prob[4]) + exp(g->gt_prob[5]) + exp(g->gt_prob[6]); - gt_meth *g2 = sample_gt[idx ^ 1] + ix; - z *= 1.0 - (exp(g2->gt_prob[2]) + exp(g2->gt_prob[5]) + exp(g2->gt_prob[7]) + exp(g2->gt_prob[8])); - } - } else { - if(g->counts[6] >= args->min_nc && (g->counts[6] + g->counts[4] >= args->min_inform)) { - if(args->sel_mode == SELECT_HOM) z = exp(g->gt_prob[7]); - else z = exp(g->gt_prob[2]) + exp(g->gt_prob[5]) + exp(g->gt_prob[7]) + exp(g->gt_prob[8]); - gt_meth *g2 = sample_gt[idx] + ix; - z *= 1.0 - (exp(g2->gt_prob[1]) + exp(g2->gt_prob[4]) + exp(g2->gt_prob[5]) + exp(g2->gt_prob[6])); - } - } - } - Q[2][ix] = z; - } - double *p = get_prob_dist(ns, Q); - double z = p[0]; - for(int i = 1; i <= ns && i < min_n; i++) z += p[i]; - int phred = calc_phred(z); - if(phred >= args->sel_thresh) { - int cx_len = bcf_get_info_values(args->hdr, rec, "CX", (void **)&cx, &cx_n, BCF_HT_STR); - int cx_sz = 
tags[FMT_CX].st[idx ^ pos].ne / ns; - int *mq_p = tags[FMT_MQ].st[idx ^ pos].ne == ns ? tags[FMT_MQ].st[idx ^ pos].dat_p : NULL; - fprintf(fp,"%s\t%"PRId64"\t%"PRId64"\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, cx_len >= 3 + pos ? cx[2 + pos] : '.'); - char *cx_p = tags[FMT_CX].st[idx ^ pos].dat_p; - for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { - gt_meth *g = sample_gt[idx ^ pos] + ix; - if(!g->skip) { - int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. of not being called genotype - fprintf(fp, "\t%c\tGQ=%d", gt_iupac[g->max_gt], gq); - if(g->max_gt != (pos ? 7 : 4)) { - int dq = calc_phred(exp(g->gt_prob[pos ? 7 : 4])); // Prob. of being CG - fprintf(fp, ";DQ=%d", dq); - } - int mq = -1; - if(mq_p != NULL) mq = mq_p[ix]; - if(mq >= 0) fprintf(fp, ";MQ=%d", mq); - if(!pos) { - if(cx_sz >= 5) fprintf(fp, ";CX=%.3s", cx_p + 2); - } else if(cx_sz >= 3) { - char tmp[3]; - tmp[2] = trans_base[(int)cx_p[0]]; - tmp[1] = trans_base[(int)cx_p[1]]; - tmp[0] = trans_base[(int)cx_p[2]]; - fprintf(fp, ";CX=%.3s", tmp); - } - int32_t ct[4]; - if(pos) { - ct[0] = g->counts[6]; - ct[1] = g->counts[4]; - } else { - ct[0] = g->counts[5]; - ct[1] = g->counts[7]; - } - ct[2] = ct[3] = 0; - uint8_t m = 1; - uint8_t msk = gt_msk[g->max_gt]; - for(int i = 0; i < 8; i++, m <<= 1) { - ct[3] += g->counts[i]; - if(msk & m) ct[2] += g->counts[i]; - } - double meth = get_meth(g, pos); - fprintf(fp, "\t%g\t%d\t%d\t%d\t%d", meth, ct[0], ct[1], ct[2], ct[3]); - } else { - fputs("\t.\t.\t.\t.\t.\t.\t.\t.", fp); - } - } - fputc('\n', fp); - } - } - } -} - -void output_nonconsec_noncpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx, bool first, cpg_prob *cpg, double *Q[]) { - static char *cx; - static int32_t cx_n; - static char *gt_iupac = "AMRWCSYGKT"; - static uint8_t gt_msk[] = {0x11, 0xb3, 0x55, 0x99, 0xa2, 0xf6, 0xaa, 0x54, 0xdc, 0x88}; - - FILE *fp = args->cpgfile; - int ns = bcf_hdr_nsamples(args->hdr); - 
int min_n = args->min_num; - int n1 = (int)(args->min_prop * (double)ns + 0.5); - if(n1 > min_n) min_n = n1; - fp=args->noncpgfile; - assert(fp != NULL); - for(int ix = 0; ix < ns; ix++) { - double z = 0.0; - gt_meth *g = sample_gt[idx] + ix; - if(!g->skip) { - if(first) { - if(g->counts[5] >= args->min_nc && (g->counts[5] + g->counts[7] >= args->min_inform)) { - if(args->sel_mode == SELECT_HOM) z = exp(g->gt_prob[4]); - else z = exp(g->gt_prob[1]) + exp(g->gt_prob[4]) + exp(g->gt_prob[5]) + exp(g->gt_prob[6]); - } - } else { - if(g->counts[6] >= args->min_nc && (g->counts[6] + g->counts[4] >= args->min_inform)) { - if(args->sel_mode == SELECT_HOM) z = exp(g->gt_prob[7]); - else z = exp(g->gt_prob[2]) + exp(g->gt_prob[5]) + exp(g->gt_prob[7]) + exp(g->gt_prob[8]); - } - } - } - Q[2][ix] = z; - } - double *p = get_prob_dist(ns, Q); - double z = p[0]; - for(int i = 1; i <= ns && i < min_n; i++) z += p[i]; - int phred = calc_phred(z); - if(phred >= args->sel_thresh) { - int cx_len = bcf_get_info_values(args->hdr, rec, "CX", (void **)&cx, &cx_n, BCF_HT_STR); - int cx_sz = tags[FMT_CX].st[idx].ne / ns; - int *mq_p = tags[FMT_MQ].st[idx].ne == ns ? tags[FMT_MQ].st[idx].dat_p : NULL; - fprintf(fp,"%s\t%"PRId64"\t%"PRId64"\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 1, cx_len >= 3 ? cx[2] : '.'); - char *cx_p = tags[FMT_CX].st[idx].dat_p; - for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { - gt_meth *g = sample_gt[idx] + ix; - if(!g->skip) { - int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. of not being called genotype - fprintf(fp, "\t%c\tGQ=%d", gt_iupac[g->max_gt], gq); - if(g->max_gt != (first ? 4 : 7)) { - int dq = calc_phred(exp(g->gt_prob[first ? 4 : 7])); // Prob. 
of being CG - fprintf(fp, ";DQ=%d", dq); - } - int mq = -1; - if(mq_p != NULL) mq = mq_p[ix]; - if(mq >= 0) fprintf(fp, ";MQ=%d", mq); - if(cx_sz >= 5) fprintf(fp, ";CX=%.3s", cx_p + 2); - int32_t ct[4]; - if(!first) { - ct[0] = g->counts[6]; - ct[1] = g->counts[4]; - } else { - ct[0] = g->counts[5]; - ct[1] = g->counts[7]; - } - ct[2] = ct[3] = 0; - uint8_t m = 1; - uint8_t msk = gt_msk[g->max_gt]; - for(int i = 0; i < 8; i++, m <<= 1) { - ct[3] += g->counts[i]; - if(msk & m) ct[2] += g->counts[i]; - } - double meth = get_meth(g, !first); - fprintf(fp, "\t%g\t%d\t%d\t%d\t%d", meth, ct[0], ct[1], ct[2], ct[3]); - } else { - fputs("\t.\t.\t.\t.\t.\t.\t.\t.", fp); - } - } - fputc('\n', fp); - } -} - -static char *rgb_tab[11] = { "0,255,0", "55,255,0", "105,255,0", "155,255,0", "205,255,0", "255,255,0", - "255,205,0", "255,155,0", "255,105,0", "255,55,0", "255,0,0" }; - -void output_bedmethyl(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt[], int idx) { - static char *cx; - static int32_t cx_n,old_rid = 0xffffffff, old_pos = -1; - - if(rec->rid == old_rid && rec->pos <= old_pos) return; - int ns = bcf_hdr_nsamples(args->hdr); - if(ns > 1) return; - gt_meth *g = sample_gt[idx]; - if(!g->skip) { - int cx_len = bcf_get_info_values(args->hdr, rec, "CX", (void **)&cx, &cx_n, BCF_HT_STR); - if(cx_len < 3) return; - char strand; - if(cx[2] == 'C') strand = '+'; - else if(cx[2] == 'G') strand = '-'; - else return; - if(strand == '+' && cx_len < 5) return; - char *cx_p = tags[FMT_CX].st[idx].dat_p; - int cx_sz = tags[FMT_CX].st[idx].ne; - char rtmp[8]; - if(strand == '+') { - int k; - for(k = 0; k < 3; k++) rtmp[k] = cx[k + 2]; - for(k = 0; k < 3 && k < cx_sz - 2; k++) rtmp[k + 4] = cx_p[k + 2]; - for(;k < 3; k++) rtmp[k + 4] = 'N'; - } else { - int k; - for(k = 0; k < 3; k++) rtmp[2 - k] = trans_base[(int)cx[k]]; - for(k = 0; k < 3 && k < cx_sz; k++) rtmp[6 - k] = trans_base[(int)cx_p[k]]; - for(;k < 3; k++) rtmp[6 - k] = 'N'; - } - bedmethyl_type btype = 
BEDMETHYL_NONE; - assert(rtmp[0] == 'C'); - if(rtmp[1] == 'G') { - btype = BEDMETHYL_CPG; - rtmp[2] = rtmp[6] = 0; - } else { - btype = rtmp[2] == 'G' ? BEDMETHYL_CHG : BEDMETHYL_CHH; - rtmp[3] = rtmp[7] = 0; - } - int32_t ct[2]; - if(strand == '-') { - ct[0] = g->counts[6]; - ct[1] = g->counts[4]; - } else { - ct[0] = g->counts[5]; - ct[1] = g->counts[7]; - } - int32_t cov = ct[0] + ct[1]; - double m = cov > 0 ? (double)ct[0] / (double)cov : 0.0; - if(cov > 0) { - FILE *fp = args->wigfile; - if(fp != NULL) { - if(rec->rid != old_rid) { - fprintf(fp, "variableStep chrom=%s\n", args->hdr->id[BCF_DT_CTG][rec->rid].key); - } - fprintf(fp, "%"PRId64"\t%.4g\n", rec->pos + 1, 100.0 * m); - } - old_rid = rec->rid; - } - FILE *fp = args->bedmethylfiles[btype]; - if(fp != NULL) { - int gq = calc_phred(1.0 - exp(g->gt_prob[g->max_gt])); // Prob. of not being called genotype - fprintf(fp, "%s\t%"PRId64"\t%"PRId64"\t\"%s\"\t%d\t%c\t%"PRId64"\t%"PRId64"\t%s\t%d\t%d\t%s\t%s\t%d\n", - args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos, rec->pos + 1, args->bedmethyl_desc, cov > 1000 ? 
1000 : cov, strand, - rec->pos, rec->pos + 1, rgb_tab[(int)(m * 10.0 + 0.5)], cov, (int)(100.0 * m), rtmp, rtmp + 4, gq); - } - old_pos = rec->pos; - } -} diff --git a/tools/gemBS_plugins/snpxtr.c b/tools/gemBS_plugins/snpxtr.c deleted file mode 100644 index 36281536..00000000 --- a/tools/gemBS_plugins/snpxtr.c +++ /dev/null @@ -1,603 +0,0 @@ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "snpxtr.h" -#include "utils.h" -#include "compress.h" - -static args_t args = { - .hdr = NULL, - .output_filename = NULL, - .snp_filename = NULL, - .output_file = NULL, - .snp_hash = NULL, - .gt = NULL, - .compress = 0, - .pass_index = -1, - .gt_index = -1, - .dbSNP = NULL, - .dbSNP_header = NULL, - .dbSNP_prefix = NULL, - .dbSNP_name = NULL, - .n_dbSNP_prefixes = 0 -}; - -const char *about(void) -{ - return "Extract SNPs\n"; -} - -const char *usage(void) -{ - return - "\n" - "About: Extract SNPs from VCF file.\n" - "Usage: bcftools +snpxtr [General Options] -- [Plugin Options]\n" - "Options:\n" - " run \"bcftools plugin\" for a list of common options\n" - "\n" - "Plugin options:\n" - " -o, --output Output file (default = stdout)\n" - " -s, --snps File containing list of SNP IDs to be selected (default selected all sites with PASS))\n" - " -D, --dbsnp (dbSNP processed file)\n" - " -z, --gzip Compress output with gzip (bgzip if available)\n" - " -j, --bzip2 Compress output with bzip2\n" - " -x, --xz Compress output with xz\n" - "\n" - "Example:\n" - " bcftools +snpxtr in.vcf -- -s snp_list.txt -o out_snps.txt -z\n" - "\n"; -} - -static FILE *open_ofile(char *name, int compress) { - FILE *fp = NULL; - int comp_ix = COMPRESS_NONE; - if(compress != 0) { - for(comp_ix = 0; comp_ix < COMPRESS_NONE; comp_ix++) { - if(compress & (1 << comp_ix)) break; - } - } - struct compress *cdata = comp_ix < COMPRESS_NONE ? 
get_compress_data() : NULL; - if(name != NULL) { - if(comp_ix < COMPRESS_NONE) { - char *tname = name; - char *suffix = cdata->compress_suffix[comp_ix]; - // Check whether file name already has suffix - char *p = strrchr(tname, '.'); - if(p == NULL || strcmp(p + 1, suffix)) { - // No, so we will have to add it - asprintf(&tname, "%s.%s", name, suffix); - } - int i = child_open(WRITE, tname, cdata->comp_path[comp_ix][0]); - fp = fdopen(i, "w"); - if(name != tname) free(tname); - } else fp = fopen(name, "w"); - } else { - if(isatty(fileno(stdout))) comp_ix = COMPRESS_NONE; - if(comp_ix < COMPRESS_NONE) { - int i = child_open(WRITE, NULL, cdata->comp_path[comp_ix][0]); - fp = fdopen(i, "w"); - } else fp = stdout; - } - return fp; -} - -static void init_files(args_t *a) { - a->output_file = open_ofile(a->output_filename, a->compress); -} - -static void close_files(args_t *a) { - if(a->output_filename != NULL && a->output_file != stdout) fclose(a->output_file); -} - -void read_snp_file(void) { - bool filter; - FILE *fp = open_readfile_and_check(args.snp_filename, &filter); - char *buf = NULL; - size_t buf_size = 0; - tokens *tok = NULL; - int nsnps = 0; - fprintf(stderr,"Reading SNP list from %s\n", args.snp_filename); - for(;;) { - ssize_t l = getline(&buf, &buf_size, fp); - if(l < 0) break; - tok = tokenize(buf, '\t', tok); - if(tok->n_tok >= 1) { - char *p = tok->toks[0]; - char *id = malloc(strlen(p) + 3); - sprintf(id, "rs%s", p); - snp *s; - HASH_FIND_STR(args.snp_hash, id, s); - if(s == NULL) { - nsnps++; - s = malloc(sizeof(snp)); - s->name = id; - HASH_ADD_KEYPTR(hh, args.snp_hash, s->name, strlen(s->name), s); - } - } - } - fclose(fp); - if(buf) free(buf); - if(filter) { - int i; - while(waitpid(-1, &i, WNOHANG) > 0); - } - fprintf(stderr,"%d SNPs read in\n", nsnps); -} - -static void store_dbsnp_entries(dbsnp_bin *bin, int n_entries, int name_buf_sz, uint16_t *entries, uint8_t *name_buf) { - bin->entries = malloc(sizeof(uint16_t) * n_entries); - bin->name_buf 
= malloc((size_t)name_buf_sz); - bin->n_entries = n_entries; - uint64_t msk = (uint64_t)0; - for(int i = 0; i < n_entries; i++) { - bin->entries[i] = entries[i]; - msk |= ((uint64_t)1 << (entries[i] & 63)); - } - bin->mask = msk; - memcpy(bin->name_buf, name_buf, name_buf_sz); -} - -static uint8_t db_tab[] = { - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, - 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x30, - 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, - 0x47, 0x48, 0x49, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x60, 0x61, 0x62, - 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, - 0x79, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x90, 0x91, 0x92, 0x93, 0x94, - 0x95, 0x96, 0x97, 0x98, 0x99, 0x0f, 0x1f, 0x2f, 0x3f, 0x4f, 0x5f, 0x6f, 0x7f, 0x8f, 0x9f, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff -}; - -static void read_dbSNP_file(void) { - bool ok = true; - bool filter; - FILE *fp = 
open_readfile_and_check(args.dbSNP_name, &filter); - char *buf = NULL; - size_t buf_size = 0; - fprintf(stderr,"Loading dbSNP from %s\n", args.dbSNP_name); - int cbin = 0, max_bin = 0, n_entries = 0, name_buf_ptr = 0, ix = -1; - dbsnp_bin *bins = NULL; - char *ctg_name = NULL; - dbsnp_ctg *ctg = NULL; - uint16_t *entries = malloc(sizeof(uint16_t) * 64); - uint8_t *name_buf = malloc(sizeof(uint8_t) * 256 * 64); - int n_snps = 0, n_bins = 0, n_ctgs = 0; - ssize_t l = getline(&buf, &buf_size, fp); - if(l > 0 && buf[l - 1] == '\n') buf[--l] = 0; - if(l > 7 && !strncmp(buf, "track ", 6)) { - args.dbSNP_header = malloc((size_t)(l - 5)); - memcpy(args.dbSNP_header, buf + 6, l - 6); - args.dbSNP_header[l - 6] = 0; - int n_p_store = 8; - char **p_store = malloc(sizeof(void *) * n_p_store); - while(true) { - l = getline(&buf, &buf_size, fp); - if(l > 0 && buf[l - 1] == '\n') buf[--l] = 0; - if(l < 1 || buf[0] != '+') break; - if(l > 1) { - if(args.n_dbSNP_prefixes == 0xffff) { - fprintf(stderr, "Error in dbSNP file: too many prefixes\n"); - exit(-1); - } - if(args.n_dbSNP_prefixes == n_p_store) { - n_p_store *= 1.5; - p_store = realloc(p_store, sizeof(void *) * n_p_store); - } - char *tp = p_store[args.n_dbSNP_prefixes++] = malloc(l); - memcpy(tp, buf + 1, l - 1); - tp[l - 1] = 0; - } - } - if(!args.n_dbSNP_prefixes) { - fprintf(stderr, "Error in dbSNP file: no prefix information\n"); - ok = false; - } else { - args.dbSNP_prefix = malloc(sizeof(void *) * args.n_dbSNP_prefixes); - memcpy(args.dbSNP_prefix, p_store, sizeof(void *) * args.n_dbSNP_prefixes); - free(p_store); - } - while(l >= 0 && ok) { - if(l > 0) { - if(buf[0] == '>') { - if(n_entries) { - store_dbsnp_entries(bins, n_entries, name_buf_ptr, entries, name_buf); - n_bins++; - n_snps += n_entries; - } - if(cbin != max_bin) { - fprintf(stderr, "Error in dbSNP file - wrong number of bins (expected %d, saw %d\n", max_bin, cbin); - ok = false; - break; - } - char *tp = strchr(buf + 1, '\t'); - if(!tp) { - 
fprintf(stderr,"Error in dbSNP file - bad chromosome header\n"); - ok = false; - break; - } - *tp = 0; - char *tp1; - cbin = (int)strtoul(tp + 1, &tp1, 10); - if(*tp1 != '\t') { - fprintf(stderr,"Error in dbSNP file - bad chromosome header\n"); - ok = false; - break; - } - max_bin = (int)strtoul(tp1 + 1, &tp, 10); - ctg_name = strdup(buf + 1); - HASH_FIND(hh, args.dbSNP, ctg_name, strlen(ctg_name), ctg); - if(ctg != NULL) { - fprintf(stderr,"Error in dbSNP file - duplicate contigs (%s)\n", ctg_name); - ok = false; - break; - } - ctg = malloc(sizeof(dbsnp_ctg)); - ctg->name = ctg_name; - ctg->min_bin = cbin; - ctg->max_bin = max_bin; - ctg->bins = malloc(sizeof(dbsnp_bin) * (max_bin - cbin + 1)); - HASH_ADD_KEYPTR(hh, args.dbSNP, ctg->name, strlen(ctg_name), ctg); - n_entries = name_buf_ptr = 0; - ix = -1; - n_ctgs++; - bins = ctg->bins; - } else { - if(ctg == NULL) { - fprintf(stderr,"Error in dbSNP file - missing contig header\n"); - ok = false; - break; - } - if(buf[0] == '+') { - if(n_entries) { - store_dbsnp_entries(bins, n_entries, name_buf_ptr, entries, name_buf); - n_bins++; - n_snps += n_entries; - } - char *tp; - int d = (int)strtoul(buf + 1, &tp, 10); - if(!d) d = 1; - cbin += d; - bins += d; - if(cbin > max_bin) { - fprintf(stderr,"Error in dbSNP file - too many bins for chromosome\n"); - ok = false; - break; - } - n_entries = name_buf_ptr = 0; - ix = -1; - } else { - if(n_entries == 64) { - fprintf(stderr,"Error in dbSNP file - too many entries for bin (max 64)\n"); - ok = false; - break; - } - if(l < 3 || l > 256) { - fprintf(stderr,"Error in dbSNP file - bad line length: %s\n", buf + 1); - ok = false; - break; - } - char tmp = buf[2]; - buf[2] = 0; - char *tp; - int ix1 = (int)strtoul(buf, &tp, 16); - buf[2] = tmp; - int prefix_ix = ix1 >> 6; - if(prefix_ix > args.n_dbSNP_prefixes) { - fprintf(stderr,"Error in dbSNP file - invalid prefix\n"); - ok = false; - break; - } - ix1 &= 63; - if(ix1 <= ix) { - fprintf(stderr,"Error in dbSNP file - entries out 
of order or invalid\n"); - ok = false; - break; - } - ix = ix1; - int kx = 2; - if(prefix_ix == 0) { - if(l < 7) { - ok = false; - fprintf(stderr,"Error in dbSNP file - bad line length: %s\n", buf); - break; - } - tmp = buf[4]; - buf[4] = 0; - uint8_t ix_high = (int)strtoul(buf + 2, &tp, 16); - buf[4] = tmp; - tmp = buf[6]; - buf[6] = 0; - uint8_t ix_low = (int)strtoul(buf + 4, &tp, 16); - buf[6] = tmp; - name_buf[name_buf_ptr++] = ix_high; - name_buf[name_buf_ptr++] = ix_low; - kx = 6; - } - int k = l - kx; - entries[n_entries++] = (k << 8) | (prefix_ix << 6) | ix; - uint8_t *tip = (uint8_t *)(buf + kx); - for(int j = 0; j < k; j++) name_buf[name_buf_ptr++] = db_tab[(int)tip[j]]; - } - } - } - l = getline(&buf, &buf_size, fp); - if(l > 0 && buf[l - 1] == '\n') buf[--l] = 0; - } - if(n_entries) { - store_dbsnp_entries(bins, n_entries, name_buf_ptr, entries, name_buf); - n_bins++; - n_snps += n_entries; - } - } else ok = false; - fclose(fp); - if(buf) free(buf); - if(filter) { - int i; - while(waitpid(-1, &i, WNOHANG) > 0); - } - if(ok) fprintf(stderr,"Completed loading dbSNP (no. contigs %d, no. bins %d, no. 
SNPs %d\n", n_ctgs, n_bins, n_snps); - else fprintf(stderr,"Error loading dbSNP\n"); -} - -int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out __unused__) -{ - args.hdr = in; - - static struct option loptions[] = { - {"output", required_argument, 0, 'o'}, - {"snps", required_argument, 0, 's'}, - {"dbsnp", required_argument, 0, 'D'}, - {"gzip", no_argument, 0, 'z'}, - {"bzip2", no_argument, 0, 'j'}, - {"xz", no_argument, 0, 'x'}, - {0,0,0,0} - }; - int c; - bool mult_comp = false; - while ((c = getopt_long(argc, argv, "?ho:s:D:zjx",loptions,NULL)) >= 0) { - switch (c) { - case 'o': - args.output_filename = optarg; - break; - case 's': - args.snp_filename = optarg; - break; - case 'D': - args.dbSNP_name = optarg; - break; - case 'z': - if(args.compress) mult_comp = true; - else args.compress |= COMP_GZIP; - break; - case 'j': - if(args.compress) mult_comp = true; - else args.compress |= COMP_BZIP2; - break; - case 'x': - if(args.compress) mult_comp = true; - else args.compress |= COMP_XZ; - break; - case 'h': - case '?': - default: error(usage()); break; - } - } - if(mult_comp) error("Can not combine multiple compression options\n"); - if (optind != argc) error(usage()); - init_files(&args); - int ns = bcf_hdr_nsamples(args.hdr); - assert(ns > 0); - args.gt = malloc(sizeof(int) * ns); - for(int i = 0; i < in->n[BCF_DT_ID]; i++) { - if(!strcmp("PASS" , in->id[BCF_DT_ID][i].key)) args.pass_index = i; - else if(!strcmp("GT" , in->id[BCF_DT_ID][i].key)) args.gt_index = i; - } - if(args.snp_filename != NULL) read_snp_file(); - if(args.dbSNP_name != NULL) read_dbSNP_file(); - FILE *fp = args.output_file; - fputs("Chrom\tPos\tId", fp); - for(int i = 0; i < ns; i++) { - fprintf(fp, "\t%s", in->id[BCF_DT_SAMPLE][i].key); - } - fputc('\n', fp); - return 1; -} - -static const int base_tab[256] = { - ['A'] = 1, ['C'] = 2, ['G'] = 3, ['T'] = 4 -}; - -static dbsnp_ctg *dbSNP_ctg; -static int curr_rid = -1; -static char dtab[16] = { '0', '1', '2', '3', '4', '5', '6', '7', 
'8', '9', 0, 0, 0, 0, 0, 0 }; - -bcf1_t *process(bcf1_t *rec) -{ - int ns = bcf_hdr_nsamples(args.hdr); - bcf_unpack(rec, BCF_UN_ALL); - char *id = rec->d.id; - char rs[512]; - void *dat_p = NULL; - int dat_n = 0; - FILE *fp = args.output_file; - if(id[1] == 0 && id[0] == '.') { - if(curr_rid != rec->rid) { - const char *ctg = args.hdr->id[BCF_DT_CTG][rec->rid].key; - if(args.dbSNP != NULL) { - HASH_FIND(hh, args.dbSNP, ctg, strlen(ctg), dbSNP_ctg); - } - curr_rid = rec->rid; - } - if(dbSNP_ctg != NULL) { - int x = rec->pos + 1; - int bn = x >> 6; - if(bn >= dbSNP_ctg->min_bin && bn <= dbSNP_ctg->max_bin) { - dbsnp_bin *b = dbSNP_ctg->bins + bn - dbSNP_ctg->min_bin; - int ix = x & 63; - uint64_t mk = (uint64_t)1 << ix; - if(b->mask & mk) { - uint64_t mk1 = b->mask & (mk - (uint64_t)1); - int i = 0, j = 0; - while(mk1) { - if(mk1 & (uint64_t)1) { - uint16_t en = b->entries[i++]; - j += en >> 8; - if(!((en >> 6) & 3)) j += 2; - } - mk1 >>= 1; - } - char *tp = rs; - int prefix_id = (b->entries[i] >> 6) & 3; - unsigned char *tp1 = b->name_buf + j; - if((prefix_id--) == 0) { - prefix_id = (tp1[0] << 8) | tp1[1]; - tp1+=2; - } - char *tp2 = args.dbSNP_prefix[prefix_id]; - while(*tp2) *tp++ = *tp2++; - j = b->entries[i] >> 8; - for(int k = 0; k < j; k++) { - unsigned char z = *tp1++; - *tp++ = dtab[z >> 4]; - *tp++ = dtab[z & 15]; - } - *tp = 0; - id = rs; - // fprintf(stderr,"%s\n", rs); - } - } - } - } - if(id != NULL && (id[0] != '.' 
|| id[1] != 0)) { - bool passed = true; - if(args.snp_hash) { - snp *s; - HASH_FIND_STR(args.snp_hash, id, s); - if(!s) passed = false; - } - if(passed) { - // passed = false; - passed = true; - // Check overall filter - for(int i = 0; i < rec->d.n_flt; i++) { - if(rec->d.flt[i] == args.pass_index) { - passed = true; - break; - } - } - int n_all = rec->n_allele; - if(passed) { - // Check alleles (only allow SNPs) - if(n_all > 4) passed = false; - else { - for(int i = 0; i < rec->n_allele; i++) { - char *p = rec->d.allele[i]; - if(p[1] || !base_tab[(int)p[0]]) { - passed = false; - break; - } - } - } - } - if(passed) { - // Get filter tag - int ne = bcf_get_format_values(args.hdr, rec, "FT", &dat_p, &dat_n, BCF_HT_STR); - // Find GT tag - int gt_i = -1; - bcf_fmt_t *fmt = rec->d.fmt; - for(int i = 0; i < (int)rec->n_fmt; i++) { - if(!fmt[i].p) continue; - if(fmt[i].id == args.gt_index) { - gt_i = i; - break; - } - } - if(gt_i >= 0) { - int sz = ne / ns; - char *flt = dat_p; - bcf_fmt_t *fmt = rec->d.fmt + gt_i; - passed = false; - for(int i = 0; i < ns; i++) { - args.gt[i] = 0; - switch(fmt->type) { - case BCF_BT_INT8: - { - if(fmt->n == 2) { - int8_t *p = (int8_t *)(fmt->p + i * fmt->size); - if(p[0] != bcf_int8_vector_end && p[1] != bcf_int8_vector_end) { - int a1 = p[0] >> 1; - int a2 = p[1] >> 1; - if(a1 < 1 || a1 > n_all || a2 < 1 || a2 > n_all) args.gt[i] = 0; - else args.gt[i] = (a1 << 4) | a2; - } - } - } - break; - case BCF_BT_INT16: - { - if(fmt->n == 2) { - int16_t *p = (int16_t *)(fmt->p + i * fmt->size); - if(p[0] != bcf_int16_vector_end && p[1] != bcf_int16_vector_end) { - int a1 = p[0] >> 1; - int a2 = p[1] >> 1; - if(a1 < 1 || a1 > n_all || a2 < 1 || a2 > n_all) args.gt[i] = 0; - else args.gt[i] = (a1 << 4) | a2; - } - } - } - break; - case BCF_BT_INT32: - { - if(fmt->n == 2) { - int32_t *p = (int32_t *)(fmt->p + i * fmt->size); - if(p[0] != bcf_int32_vector_end && p[1] != bcf_int32_vector_end) { - int a1 = p[0] >> 1; - int a2 = p[1] >> 1; - if(a1 < 
1 || a1 > n_all || a2 < 1 || a2 > n_all) args.gt[i] = 0; - else args.gt[i] = (a1 << 4) | a2; - } - } - } - break; - } - if(flt != NULL && strcmp("PASS", flt + i * sz)) args.gt[i] = 0; - if(args.gt[i]) passed = true; - } - if(passed) { - fprintf(fp, "%s\t%" PRId64 "\t%s", args.hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + 1, id); - for(int i = 0; i < ns; i++) { - const int gt = args.gt[i]; - if(gt > 0) fprintf(fp, "\t%s%s", rec->d.allele[(gt >> 4) - 1], rec->d.allele[(gt & 7) - 1]); - else fputs("\t00", fp); - } - fputc('\n',fp); - } - } - } - } - } - return NULL; -} - -void destroy(void) -{ - close_files(&args); -} diff --git a/tools/gemBS_plugins/snpxtr.h b/tools/gemBS_plugins/snpxtr.h deleted file mode 100644 index 8cb76b23..00000000 --- a/tools/gemBS_plugins/snpxtr.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef SNPXTR_H_ -#define SNPXTR_H_ - -#include -#include -#include -#include -#include -#include - -#include "uthash.h" - -#define COMP_GZIP (1 << COMPRESS_GZIP) -#define COMP_BZIP2 (1 << COMPRESS_BZIP2) -#define COMP_XZ (1 << COMPRESS_XZ) - -void error(const char *format, ...) 
HTS_NORETURN; - -typedef struct { - uint64_t mask; - int n_entries; - uint16_t *entries; - uint8_t *name_buf; -} dbsnp_bin; - -typedef struct { - char *name; - int min_bin; - int max_bin; - dbsnp_bin *bins; - UT_hash_handle hh; -} dbsnp_ctg; - -typedef struct { - char *name; - UT_hash_handle hh; -} snp; - -typedef struct { - bcf_hdr_t *hdr; - char *output_filename; - char *snp_filename; - FILE *output_file; - snp *snp_hash; - char *dbSNP_name; - dbsnp_ctg *dbSNP; - char **dbSNP_prefix; - char *dbSNP_header; - uint16_t n_dbSNP_prefixes; - int *gt; - int compress; - int pass_index; - int gt_index; -} args_t; - -#endif // SNPXTR_H_ diff --git a/tools/gemBS_plugins/snpxtr.mk b/tools/gemBS_plugins/snpxtr.mk deleted file mode 100644 index f1190e0e..00000000 --- a/tools/gemBS_plugins/snpxtr.mk +++ /dev/null @@ -1,2 +0,0 @@ -plugins/snpxtr.so: plugins/snpxtr.c version.h version.c ../gemBS_plugins/utils.c ../gemBS_plugins/compress.c ../gemBS_plugins/compress.h ../gemBS_plugins/utils.h ../gemBS_plugins/uthash.h ../gemBS_plugins/snpxtr.h - $(CC) --std=gnu11 $(PLUGIN_FLAGS) $(CFLAGS) $(ALL_CPPFLAGS) -I ../gemBS_plugins $(EXTRA_CPPFLAGS) $(LDFLAGS) -o $@ ../gemBS_plugins/utils.c ../gemBS_plugins/compress.c version.c $< $(LIBS) diff --git a/tools/gemBS_plugins/uthash.h b/tools/gemBS_plugins/uthash.h deleted file mode 100644 index fe2d51b6..00000000 --- a/tools/gemBS_plugins/uthash.h +++ /dev/null @@ -1,917 +0,0 @@ -/* -Copyright (c) 2003-2013, Troy D. Hanson http://uthash.sourceforge.net -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER -OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifndef UTHASH_H -#define UTHASH_H - -#include /* memcmp,strlen */ -#include /* ptrdiff_t */ -#include /* exit() */ - -/* These macros use decltype or the earlier __typeof GNU extension. - As decltype is only available in newer compilers (VS2010 or gcc 4.3+ - when compiling c++ source) this code uses whatever method is needed - or, for VS2008 where neither is available, uses casting workarounds. 
*/ -#ifdef _MSC_VER /* MS compiler */ -#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ -#define DECLTYPE(x) (decltype(x)) -#else /* VS2008 or older (or VS2010 in C mode) */ -#define NO_DECLTYPE -#define DECLTYPE(x) -#endif -#else /* GNU, Sun and other compilers */ -#define DECLTYPE(x) (__typeof(x)) -#endif - -#ifdef NO_DECLTYPE -#define DECLTYPE_ASSIGN(dst,src) \ -do { \ - char **_da_dst = (char**)(&(dst)); \ - *_da_dst = (char*)(src); \ -} while(0) -#else -#define DECLTYPE_ASSIGN(dst,src) \ -do { \ - (dst) = DECLTYPE(dst)(src); \ -} while(0) -#endif - -/* a number of the hash function use uint32_t which isn't defined on win32 */ -#ifdef _MSC_VER -typedef unsigned int uint32_t; -typedef unsigned char uint8_t; -#else -#include /* uint32_t */ -#endif - -#define UTHASH_VERSION 1.9.7 - -#ifndef uthash_fatal -#define uthash_fatal(msg) exit(-1) /* fatal error (out of memory,etc) */ -#endif -#ifndef uthash_malloc -#define uthash_malloc(sz) malloc(sz) /* malloc fcn */ -#endif -#ifndef uthash_free -#define uthash_free(ptr,sz) free(ptr) /* free fcn */ -#endif - -#ifndef uthash_noexpand_fyi -#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ -#endif -#ifndef uthash_expand_fyi -#define uthash_expand_fyi(tbl) /* can be defined to log expands */ -#endif - -/* initial number of buckets */ -#define HASH_INITIAL_NUM_BUCKETS 32 /* initial number of buckets */ -#define HASH_INITIAL_NUM_BUCKETS_LOG2 5 /* lg2 of initial number of buckets */ -#define HASH_BKT_CAPACITY_THRESH 10 /* expand when bucket count reaches */ - -/* calculate the element whose hash handle address is hhe */ -#define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho))) - -#define HASH_FIND(hh,head,keyptr,keylen,out) \ -do { \ - unsigned _hf_bkt,_hf_hashv; \ - out=NULL; \ - if (head) { \ - HASH_FCN(keyptr,keylen, (head)->hh.tbl->num_buckets, _hf_hashv, _hf_bkt); \ - if (HASH_BLOOM_TEST((head)->hh.tbl, _hf_hashv)) { \ - HASH_FIND_IN_BKT((head)->hh.tbl, hh, 
(head)->hh.tbl->buckets[ _hf_bkt ], \ - keyptr,keylen,out); \ - } \ - } \ -} while (0) - -#ifdef HASH_BLOOM -#define HASH_BLOOM_BITLEN (1ULL << HASH_BLOOM) -#define HASH_BLOOM_BYTELEN (HASH_BLOOM_BITLEN/8) + ((HASH_BLOOM_BITLEN%8) ? 1:0) -#define HASH_BLOOM_MAKE(tbl) \ -do { \ - (tbl)->bloom_nbits = HASH_BLOOM; \ - (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN); \ - if (!((tbl)->bloom_bv)) { uthash_fatal( "out of memory"); } \ - memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN); \ - (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ -} while (0) - -#define HASH_BLOOM_FREE(tbl) \ -do { \ - uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ -} while (0) - -#define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8))) -#define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8))) - -#define HASH_BLOOM_ADD(tbl,hashv) \ - HASH_BLOOM_BITSET((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1))) - -#define HASH_BLOOM_TEST(tbl,hashv) \ - HASH_BLOOM_BITTEST((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1))) - -#else -#define HASH_BLOOM_MAKE(tbl) -#define HASH_BLOOM_FREE(tbl) -#define HASH_BLOOM_ADD(tbl,hashv) -#define HASH_BLOOM_TEST(tbl,hashv) (1) -#endif - -#define HASH_MAKE_TABLE(hh,head) \ -do { \ - (head)->hh.tbl = (UT_hash_table*)uthash_malloc( \ - sizeof(UT_hash_table)); \ - if (!((head)->hh.tbl)) { uthash_fatal( "out of memory"); } \ - memset((head)->hh.tbl, 0, sizeof(UT_hash_table)); \ - (head)->hh.tbl->tail = &((head)->hh); \ - (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ - (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ - (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head); \ - (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc( \ - HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ - if (! 
(head)->hh.tbl->buckets) { uthash_fatal( "out of memory"); } \ - memset((head)->hh.tbl->buckets, 0, \ - HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ - HASH_BLOOM_MAKE((head)->hh.tbl); \ - (head)->hh.tbl->signature = HASH_SIGNATURE; \ -} while(0) - -#define HASH_ADD(hh,head,fieldname,keylen_in,add) \ - HASH_ADD_KEYPTR(hh,head,&((add)->fieldname),keylen_in,add) - -#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add) \ -do { \ - unsigned _ha_bkt; \ - (add)->hh.next = NULL; \ - (add)->hh.key = (char*)keyptr; \ - (add)->hh.keylen = (unsigned)keylen_in; \ - if (!(head)) { \ - head = (add); \ - (head)->hh.prev = NULL; \ - HASH_MAKE_TABLE(hh,head); \ - } else { \ - (head)->hh.tbl->tail->next = (add); \ - (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ - (head)->hh.tbl->tail = &((add)->hh); \ - } \ - (head)->hh.tbl->num_items++; \ - (add)->hh.tbl = (head)->hh.tbl; \ - HASH_FCN(keyptr,keylen_in, (head)->hh.tbl->num_buckets, \ - (add)->hh.hashv, _ha_bkt); \ - HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt],&(add)->hh); \ - HASH_BLOOM_ADD((head)->hh.tbl,(add)->hh.hashv); \ - HASH_EMIT_KEY(hh,head,keyptr,keylen_in); \ - HASH_FSCK(hh,head); \ -} while(0) - -#define HASH_TO_BKT( hashv, num_bkts, bkt ) \ -do { \ - bkt = ((hashv) & ((num_bkts) - 1)); \ -} while(0) - -/* delete "delptr" from the hash table. - * "the usual" patch-up process for the app-order doubly-linked-list. - * The use of _hd_hh_del below deserves special explanation. - * These used to be expressed using (delptr) but that led to a bug - * if someone used the same symbol for the head and deletee, like - * HASH_DELETE(hh,users,users); - * We want that to work, but by changing the head (users) below - * we were forfeiting our ability to further refer to the deletee (users) - * in the patch-up process. Solution: use scratch space to - * copy the deletee pointer, then the latter references are via that - * scratch pointer rather than through the repointed (users) symbol. 
- */ -#define HASH_DELETE(hh,head,delptr) \ -do { \ - unsigned _hd_bkt; \ - struct UT_hash_handle *_hd_hh_del; \ - if ( ((delptr)->hh.prev == NULL) && ((delptr)->hh.next == NULL) ) { \ - uthash_free((head)->hh.tbl->buckets, \ - (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \ - HASH_BLOOM_FREE((head)->hh.tbl); \ - uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ - head = NULL; \ - } else { \ - _hd_hh_del = &((delptr)->hh); \ - if ((delptr) == ELMT_FROM_HH((head)->hh.tbl,(head)->hh.tbl->tail)) { \ - (head)->hh.tbl->tail = \ - (UT_hash_handle*)((ptrdiff_t)((delptr)->hh.prev) + \ - (head)->hh.tbl->hho); \ - } \ - if ((delptr)->hh.prev) { \ - ((UT_hash_handle*)((ptrdiff_t)((delptr)->hh.prev) + \ - (head)->hh.tbl->hho))->next = (delptr)->hh.next; \ - } else { \ - DECLTYPE_ASSIGN(head,(delptr)->hh.next); \ - } \ - if (_hd_hh_del->next) { \ - ((UT_hash_handle*)((ptrdiff_t)_hd_hh_del->next + \ - (head)->hh.tbl->hho))->prev = \ - _hd_hh_del->prev; \ - } \ - HASH_TO_BKT( _hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ - HASH_DEL_IN_BKT(hh,(head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ - (head)->hh.tbl->num_items--; \ - } \ - HASH_FSCK(hh,head); \ -} while (0) - - -/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ -#define HASH_FIND_STR(head,findstr,out) \ - HASH_FIND(hh,head,findstr,strlen(findstr),out) -#define HASH_ADD_STR(head,strfield,add) \ - HASH_ADD(hh,head,strfield,strlen(add->strfield),add) -#define HASH_FIND_INT(head,findint,out) \ - HASH_FIND(hh,head,findint,sizeof(int),out) -#define HASH_ADD_INT(head,intfield,add) \ - HASH_ADD(hh,head,intfield,sizeof(int),add) -#define HASH_FIND_PTR(head,findptr,out) \ - HASH_FIND(hh,head,findptr,sizeof(void *),out) -#define HASH_ADD_PTR(head,ptrfield,add) \ - HASH_ADD(hh,head,ptrfield,sizeof(void *),add) -#define HASH_DEL(head,delptr) \ - HASH_DELETE(hh,head,delptr) - -/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined. 
- * This is for uthash developer only; it compiles away if HASH_DEBUG isn't defined. - */ -#ifdef HASH_DEBUG -#define HASH_OOPS(...) do { fprintf(stderr,__VA_ARGS__); exit(-1); } while (0) -#define HASH_FSCK(hh,head) \ -do { \ - unsigned _bkt_i; \ - unsigned _count, _bkt_count; \ - char *_prev; \ - struct UT_hash_handle *_thh; \ - if (head) { \ - _count = 0; \ - for( _bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; _bkt_i++) { \ - _bkt_count = 0; \ - _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ - _prev = NULL; \ - while (_thh) { \ - if (_prev != (char*)(_thh->hh_prev)) { \ - HASH_OOPS("invalid hh_prev %p, actual %p\n", \ - _thh->hh_prev, _prev ); \ - } \ - _bkt_count++; \ - _prev = (char*)(_thh); \ - _thh = _thh->hh_next; \ - } \ - _count += _bkt_count; \ - if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \ - HASH_OOPS("invalid bucket count %d, actual %d\n", \ - (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ - } \ - } \ - if (_count != (head)->hh.tbl->num_items) { \ - HASH_OOPS("invalid hh item count %d, actual %d\n", \ - (head)->hh.tbl->num_items, _count ); \ - } \ - /* traverse hh in app order; check next/prev integrity, count */ \ - _count = 0; \ - _prev = NULL; \ - _thh = &(head)->hh; \ - while (_thh) { \ - _count++; \ - if (_prev !=(char*)(_thh->prev)) { \ - HASH_OOPS("invalid prev %p, actual %p\n", \ - _thh->prev, _prev ); \ - } \ - _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh); \ - _thh = ( _thh->next ? (UT_hash_handle*)((char*)(_thh->next) + \ - (head)->hh.tbl->hho) : NULL ); \ - } \ - if (_count != (head)->hh.tbl->num_items) { \ - HASH_OOPS("invalid app item count %d, actual %d\n", \ - (head)->hh.tbl->num_items, _count ); \ - } \ - } \ -} while (0) -#else -#define HASH_FSCK(hh,head) -#endif - -/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to - * the descriptor to which this macro is defined for tuning the hash function. - * The app can #include to get the prototype for write(2). 
*/ -#ifdef HASH_EMIT_KEYS -#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) \ -do { \ - unsigned _klen = fieldlen; \ - write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \ - write(HASH_EMIT_KEYS, keyptr, fieldlen); \ -} while (0) -#else -#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) -#endif - -/* default to Jenkin's hash unless overridden e.g. DHASH_FUNCTION=HASH_SAX */ -#ifdef HASH_FUNCTION -#define HASH_FCN HASH_FUNCTION -#else -#define HASH_FCN HASH_JEN -#endif - -/* The Bernstein hash function, used in Perl prior to v5.6 */ -#define HASH_BER(key,keylen,num_bkts,hashv,bkt) \ -do { \ - unsigned _hb_keylen=keylen; \ - char *_hb_key=(char*)(key); \ - (hashv) = 0; \ - while (_hb_keylen--) { (hashv) = ((hashv) * 33) + *_hb_key++; } \ - bkt = (hashv) & (num_bkts-1); \ -} while (0) - - -/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at - * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */ -#define HASH_SAX(key,keylen,num_bkts,hashv,bkt) \ -do { \ - unsigned _sx_i; \ - char *_hs_key=(char*)(key); \ - hashv = 0; \ - for(_sx_i=0; _sx_i < keylen; _sx_i++) \ - hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \ - bkt = hashv & (num_bkts-1); \ -} while (0) - -#define HASH_FNV(key,keylen,num_bkts,hashv,bkt) \ -do { \ - unsigned _fn_i; \ - char *_hf_key=(char*)(key); \ - hashv = 2166136261UL; \ - for(_fn_i=0; _fn_i < keylen; _fn_i++) \ - hashv = (hashv * 16777619) ^ _hf_key[_fn_i]; \ - bkt = hashv & (num_bkts-1); \ -} while(0) - -#define HASH_OAT(key,keylen,num_bkts,hashv,bkt) \ -do { \ - unsigned _ho_i; \ - char *_ho_key=(char*)(key); \ - hashv = 0; \ - for(_ho_i=0; _ho_i < keylen; _ho_i++) { \ - hashv += _ho_key[_ho_i]; \ - hashv += (hashv << 10); \ - hashv ^= (hashv >> 6); \ - } \ - hashv += (hashv << 3); \ - hashv ^= (hashv >> 11); \ - hashv += (hashv << 15); \ - bkt = hashv & (num_bkts-1); \ -} while(0) - -#define HASH_JEN_MIX(a,b,c) \ -do { \ - a -= b; a -= c; a ^= ( c >> 13 ); \ - b -= c; b -= a; b ^= ( a << 8 ); \ - c -= a; c 
-= b; c ^= ( b >> 13 ); \ - a -= b; a -= c; a ^= ( c >> 12 ); \ - b -= c; b -= a; b ^= ( a << 16 ); \ - c -= a; c -= b; c ^= ( b >> 5 ); \ - a -= b; a -= c; a ^= ( c >> 3 ); \ - b -= c; b -= a; b ^= ( a << 10 ); \ - c -= a; c -= b; c ^= ( b >> 15 ); \ -} while (0) - -#define HASH_JEN(key,keylen,num_bkts,hashv,bkt) \ -do { \ - unsigned _hj_i,_hj_j,_hj_k; \ - char *_hj_key=(char*)(key); \ - hashv = 0xfeedbeef; \ - _hj_i = _hj_j = 0x9e3779b9; \ - _hj_k = (unsigned)keylen; \ - while (_hj_k >= 12) { \ - _hj_i += (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 ) \ - + ( (unsigned)_hj_key[2] << 16 ) \ - + ( (unsigned)_hj_key[3] << 24 ) ); \ - _hj_j += (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 ) \ - + ( (unsigned)_hj_key[6] << 16 ) \ - + ( (unsigned)_hj_key[7] << 24 ) ); \ - hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 ) \ - + ( (unsigned)_hj_key[10] << 16 ) \ - + ( (unsigned)_hj_key[11] << 24 ) ); \ - \ - HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ - \ - _hj_key += 12; \ - _hj_k -= 12; \ - } \ - hashv += keylen; \ - switch ( _hj_k ) { \ - case 11: hashv += ( (unsigned)_hj_key[10] << 24 ); \ - case 10: hashv += ( (unsigned)_hj_key[9] << 16 ); \ - case 9: hashv += ( (unsigned)_hj_key[8] << 8 ); \ - case 8: _hj_j += ( (unsigned)_hj_key[7] << 24 ); \ - case 7: _hj_j += ( (unsigned)_hj_key[6] << 16 ); \ - case 6: _hj_j += ( (unsigned)_hj_key[5] << 8 ); \ - case 5: _hj_j += _hj_key[4]; \ - case 4: _hj_i += ( (unsigned)_hj_key[3] << 24 ); \ - case 3: _hj_i += ( (unsigned)_hj_key[2] << 16 ); \ - case 2: _hj_i += ( (unsigned)_hj_key[1] << 8 ); \ - case 1: _hj_i += _hj_key[0]; \ - } \ - HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ - bkt = hashv & (num_bkts-1); \ -} while(0) - -/* The Paul Hsieh hash function */ -#undef get16bits -#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ - || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) -#define get16bits(d) (*((const uint16_t *) (d))) -#endif - -#if !defined (get16bits) -#define get16bits(d) 
((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \ - +(uint32_t)(((const uint8_t *)(d))[0]) ) -#endif -#define HASH_SFH(key,keylen,num_bkts,hashv,bkt) \ -do { \ - char *_sfh_key=(char*)(key); \ - uint32_t _sfh_tmp, _sfh_len = keylen; \ - \ - int _sfh_rem = _sfh_len & 3; \ - _sfh_len >>= 2; \ - hashv = 0xcafebabe; \ - \ - /* Main loop */ \ - for (;_sfh_len > 0; _sfh_len--) { \ - hashv += get16bits (_sfh_key); \ - _sfh_tmp = (get16bits (_sfh_key+2) << 11) ^ hashv; \ - hashv = (hashv << 16) ^ _sfh_tmp; \ - _sfh_key += 2*sizeof (uint16_t); \ - hashv += hashv >> 11; \ - } \ - \ - /* Handle end cases */ \ - switch (_sfh_rem) { \ - case 3: hashv += get16bits (_sfh_key); \ - hashv ^= hashv << 16; \ - hashv ^= _sfh_key[sizeof (uint16_t)] << 18; \ - hashv += hashv >> 11; \ - break; \ - case 2: hashv += get16bits (_sfh_key); \ - hashv ^= hashv << 11; \ - hashv += hashv >> 17; \ - break; \ - case 1: hashv += *_sfh_key; \ - hashv ^= hashv << 10; \ - hashv += hashv >> 1; \ - } \ - \ - /* Force "avalanching" of final 127 bits */ \ - hashv ^= hashv << 3; \ - hashv += hashv >> 5; \ - hashv ^= hashv << 4; \ - hashv += hashv >> 17; \ - hashv ^= hashv << 25; \ - hashv += hashv >> 6; \ - bkt = hashv & (num_bkts-1); \ -} while(0) - -#ifdef HASH_USING_NO_STRICT_ALIASING -/* The MurmurHash exploits some CPU's (x86,x86_64) tolerance for unaligned reads. - * For other types of CPU's (e.g. Sparc) an unaligned read causes a bus error. - * MurmurHash uses the faster approach only on CPU's where we know it's safe. 
- * - * Note the preprocessor built-in defines can be emitted using: - * - * gcc -m64 -dM -E - < /dev/null (on gcc) - * cc -## a.c (where a.c is a simple test file) (Sun Studio) - */ -#if (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86)) -#define MUR_GETBLOCK(p,i) p[i] -#else /* non intel */ -#define MUR_PLUS0_ALIGNED(p) (((unsigned long)p & 0x3) == 0) -#define MUR_PLUS1_ALIGNED(p) (((unsigned long)p & 0x3) == 1) -#define MUR_PLUS2_ALIGNED(p) (((unsigned long)p & 0x3) == 2) -#define MUR_PLUS3_ALIGNED(p) (((unsigned long)p & 0x3) == 3) -#define WP(p) ((uint32_t*)((unsigned long)(p) & ~3UL)) -#if (defined(__BIG_ENDIAN__) || defined(SPARC) || defined(__ppc__) || defined(__ppc64__)) -#define MUR_THREE_ONE(p) ((((*WP(p))&0x00ffffff) << 8) | (((*(WP(p)+1))&0xff000000) >> 24)) -#define MUR_TWO_TWO(p) ((((*WP(p))&0x0000ffff) <<16) | (((*(WP(p)+1))&0xffff0000) >> 16)) -#define MUR_ONE_THREE(p) ((((*WP(p))&0x000000ff) <<24) | (((*(WP(p)+1))&0xffffff00) >> 8)) -#else /* assume little endian non-intel */ -#define MUR_THREE_ONE(p) ((((*WP(p))&0xffffff00) >> 8) | (((*(WP(p)+1))&0x000000ff) << 24)) -#define MUR_TWO_TWO(p) ((((*WP(p))&0xffff0000) >>16) | (((*(WP(p)+1))&0x0000ffff) << 16)) -#define MUR_ONE_THREE(p) ((((*WP(p))&0xff000000) >>24) | (((*(WP(p)+1))&0x00ffffff) << 8)) -#endif -#define MUR_GETBLOCK(p,i) (MUR_PLUS0_ALIGNED(p) ? ((p)[i]) : \ - (MUR_PLUS1_ALIGNED(p) ? MUR_THREE_ONE(p) : \ - (MUR_PLUS2_ALIGNED(p) ? 
MUR_TWO_TWO(p) : \ - MUR_ONE_THREE(p)))) -#endif -#define MUR_ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) -#define MUR_FMIX(_h) \ -do { \ - _h ^= _h >> 16; \ - _h *= 0x85ebca6b; \ - _h ^= _h >> 13; \ - _h *= 0xc2b2ae35l; \ - _h ^= _h >> 16; \ -} while(0) - -#define HASH_MUR(key,keylen,num_bkts,hashv,bkt) \ -do { \ - const uint8_t *_mur_data = (const uint8_t*)(key); \ - const int _mur_nblocks = (keylen) / 4; \ - uint32_t _mur_h1 = 0xf88D5353; \ - uint32_t _mur_c1 = 0xcc9e2d51; \ - uint32_t _mur_c2 = 0x1b873593; \ - uint32_t _mur_k1 = 0; \ - const uint8_t *_mur_tail; \ - const uint32_t *_mur_blocks = (const uint32_t*)(_mur_data+_mur_nblocks*4); \ - int _mur_i; \ - for(_mur_i = -_mur_nblocks; _mur_i; _mur_i++) { \ - _mur_k1 = MUR_GETBLOCK(_mur_blocks,_mur_i); \ - _mur_k1 *= _mur_c1; \ - _mur_k1 = MUR_ROTL32(_mur_k1,15); \ - _mur_k1 *= _mur_c2; \ - \ - _mur_h1 ^= _mur_k1; \ - _mur_h1 = MUR_ROTL32(_mur_h1,13); \ - _mur_h1 = _mur_h1*5+0xe6546b64; \ - } \ - _mur_tail = (const uint8_t*)(_mur_data + _mur_nblocks*4); \ - _mur_k1=0; \ - switch((keylen) & 3) { \ - case 3: _mur_k1 ^= _mur_tail[2] << 16; \ - case 2: _mur_k1 ^= _mur_tail[1] << 8; \ - case 1: _mur_k1 ^= _mur_tail[0]; \ - _mur_k1 *= _mur_c1; \ - _mur_k1 = MUR_ROTL32(_mur_k1,15); \ - _mur_k1 *= _mur_c2; \ - _mur_h1 ^= _mur_k1; \ - } \ - _mur_h1 ^= (keylen); \ - MUR_FMIX(_mur_h1); \ - hashv = _mur_h1; \ - bkt = hashv & (num_bkts-1); \ -} while(0) -#endif /* HASH_USING_NO_STRICT_ALIASING */ - -/* key comparison function; return 0 if keys equal */ -#define HASH_KEYCMP(a,b,len) memcmp(a,b,len) - -/* iterate over items in a known bucket to find desired item */ -#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,out) \ -do { \ - if (head.hh_head) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,head.hh_head)); \ - else out=NULL; \ - while (out) { \ - if ((out)->hh.keylen == keylen_in) { \ - if ((HASH_KEYCMP((out)->hh.key,keyptr,keylen_in)) == 0) break; \ - } \ - if ((out)->hh.hh_next) 
DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,(out)->hh.hh_next)); \ - else out = NULL; \ - } \ -} while(0) - -/* add an item to a bucket */ -#define HASH_ADD_TO_BKT(head,addhh) \ -do { \ - head.count++; \ - (addhh)->hh_next = head.hh_head; \ - (addhh)->hh_prev = NULL; \ - if (head.hh_head) { (head).hh_head->hh_prev = (addhh); } \ - (head).hh_head=addhh; \ - if (head.count >= ((head.expand_mult+1) * HASH_BKT_CAPACITY_THRESH) \ - && (addhh)->tbl->noexpand != 1) { \ - HASH_EXPAND_BUCKETS((addhh)->tbl); \ - } \ -} while(0) - -/* remove an item from a given bucket */ -#define HASH_DEL_IN_BKT(hh,head,hh_del) \ - (head).count--; \ - if ((head).hh_head == hh_del) { \ - (head).hh_head = hh_del->hh_next; \ - } \ - if (hh_del->hh_prev) { \ - hh_del->hh_prev->hh_next = hh_del->hh_next; \ - } \ - if (hh_del->hh_next) { \ - hh_del->hh_next->hh_prev = hh_del->hh_prev; \ - } - -/* Bucket expansion has the effect of doubling the number of buckets - * and redistributing the items into the new buckets. Ideally the - * items will distribute more or less evenly into the new buckets - * (the extent to which this is true is a measure of the quality of - * the hash function as it applies to the key domain). - * - * With the items distributed into more buckets, the chain length - * (item count) in each bucket is reduced. Thus by expanding buckets - * the hash keeps a bound on the chain length. This bounded chain - * length is the essence of how a hash provides constant time lookup. - * - * The calculation of tbl->ideal_chain_maxlen below deserves some - * explanation. First, keep in mind that we're calculating the ideal - * maximum chain length based on the *new* (doubled) bucket count. - * In fractions this is just n/b (n=number of items,b=new num buckets). - * Since the ideal chain length is an integer, we want to calculate - * ceil(n/b). 
We don't depend on floating point arithmetic in this - * hash, so to calculate ceil(n/b) with integers we could write - * - * ceil(n/b) = (n/b) + ((n%b)?1:0) - * - * and in fact a previous version of this hash did just that. - * But now we have improved things a bit by recognizing that b is - * always a power of two. We keep its base 2 log handy (call it lb), - * so now we can write this with a bit shift and logical AND: - * - * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) - * - */ -#define HASH_EXPAND_BUCKETS(tbl) \ -do { \ - unsigned _he_bkt; \ - unsigned _he_bkt_i; \ - struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ - UT_hash_bucket *_he_new_buckets, *_he_newbkt; \ - _he_new_buckets = (UT_hash_bucket*)uthash_malloc( \ - 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ - if (!_he_new_buckets) { uthash_fatal( "out of memory"); } \ - memset(_he_new_buckets, 0, \ - 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ - tbl->ideal_chain_maxlen = \ - (tbl->num_items >> (tbl->log2_num_buckets+1)) + \ - ((tbl->num_items & ((tbl->num_buckets*2)-1)) ? 
1 : 0); \ - tbl->nonideal_items = 0; \ - for(_he_bkt_i = 0; _he_bkt_i < tbl->num_buckets; _he_bkt_i++) \ - { \ - _he_thh = tbl->buckets[ _he_bkt_i ].hh_head; \ - while (_he_thh) { \ - _he_hh_nxt = _he_thh->hh_next; \ - HASH_TO_BKT( _he_thh->hashv, tbl->num_buckets*2, _he_bkt); \ - _he_newbkt = &(_he_new_buckets[ _he_bkt ]); \ - if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) { \ - tbl->nonideal_items++; \ - _he_newbkt->expand_mult = _he_newbkt->count / \ - tbl->ideal_chain_maxlen; \ - } \ - _he_thh->hh_prev = NULL; \ - _he_thh->hh_next = _he_newbkt->hh_head; \ - if (_he_newbkt->hh_head) _he_newbkt->hh_head->hh_prev = \ - _he_thh; \ - _he_newbkt->hh_head = _he_thh; \ - _he_thh = _he_hh_nxt; \ - } \ - } \ - uthash_free( tbl->buckets, tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \ - tbl->num_buckets *= 2; \ - tbl->log2_num_buckets++; \ - tbl->buckets = _he_new_buckets; \ - tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1)) ? \ - (tbl->ineff_expands+1) : 0; \ - if (tbl->ineff_expands > 1) { \ - tbl->noexpand=1; \ - uthash_noexpand_fyi(tbl); \ - } \ - uthash_expand_fyi(tbl); \ -} while(0) - - -/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ -/* Note that HASH_SORT assumes the hash handle name to be hh. - * HASH_SRT was added to allow the hash handle name to be passed in. */ -#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn) -#define HASH_SRT(hh,head,cmpfcn) \ -do { \ - unsigned _hs_i; \ - unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize; \ - struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ - if (head) { \ - _hs_insize = 1; \ - _hs_looping = 1; \ - _hs_list = &((head)->hh); \ - while (_hs_looping) { \ - _hs_p = _hs_list; \ - _hs_list = NULL; \ - _hs_tail = NULL; \ - _hs_nmerges = 0; \ - while (_hs_p) { \ - _hs_nmerges++; \ - _hs_q = _hs_p; \ - _hs_psize = 0; \ - for ( _hs_i = 0; _hs_i < _hs_insize; _hs_i++ ) { \ - _hs_psize++; \ - _hs_q = (UT_hash_handle*)((_hs_q->next) ? 
\ - ((void*)((char*)(_hs_q->next) + \ - (head)->hh.tbl->hho)) : NULL); \ - if (! (_hs_q) ) break; \ - } \ - _hs_qsize = _hs_insize; \ - while ((_hs_psize > 0) || ((_hs_qsize > 0) && _hs_q )) { \ - if (_hs_psize == 0) { \ - _hs_e = _hs_q; \ - _hs_q = (UT_hash_handle*)((_hs_q->next) ? \ - ((void*)((char*)(_hs_q->next) + \ - (head)->hh.tbl->hho)) : NULL); \ - _hs_qsize--; \ - } else if ( (_hs_qsize == 0) || !(_hs_q) ) { \ - _hs_e = _hs_p; \ - _hs_p = (UT_hash_handle*)((_hs_p->next) ? \ - ((void*)((char*)(_hs_p->next) + \ - (head)->hh.tbl->hho)) : NULL); \ - _hs_psize--; \ - } else if (( \ - cmpfcn(DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_p)), \ - DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_q))) \ - ) <= 0) { \ - _hs_e = _hs_p; \ - _hs_p = (UT_hash_handle*)((_hs_p->next) ? \ - ((void*)((char*)(_hs_p->next) + \ - (head)->hh.tbl->hho)) : NULL); \ - _hs_psize--; \ - } else { \ - _hs_e = _hs_q; \ - _hs_q = (UT_hash_handle*)((_hs_q->next) ? \ - ((void*)((char*)(_hs_q->next) + \ - (head)->hh.tbl->hho)) : NULL); \ - _hs_qsize--; \ - } \ - if ( _hs_tail ) { \ - _hs_tail->next = ((_hs_e) ? \ - ELMT_FROM_HH((head)->hh.tbl,_hs_e) : NULL); \ - } else { \ - _hs_list = _hs_e; \ - } \ - _hs_e->prev = ((_hs_tail) ? \ - ELMT_FROM_HH((head)->hh.tbl,_hs_tail) : NULL); \ - _hs_tail = _hs_e; \ - } \ - _hs_p = _hs_q; \ - } \ - _hs_tail->next = NULL; \ - if ( _hs_nmerges <= 1 ) { \ - _hs_looping=0; \ - (head)->hh.tbl->tail = _hs_tail; \ - DECLTYPE_ASSIGN(head,ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ - } \ - _hs_insize *= 2; \ - } \ - HASH_FSCK(hh,head); \ - } \ -} while (0) - -/* This function selects items from one hash into another hash. - * The end result is that the selected items have dual presence - * in both hashes. There is no copy of the items made; rather - * they are added into the new hash through a secondary hash - * hash handle that must be present in the structure. 
*/ -#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ -do { \ - unsigned _src_bkt, _dst_bkt; \ - void *_last_elt=NULL, *_elt; \ - UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL; \ - ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst)); \ - if (src) { \ - for(_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) { \ - for(_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \ - _src_hh; \ - _src_hh = _src_hh->hh_next) { \ - _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ - if (cond(_elt)) { \ - _dst_hh = (UT_hash_handle*)(((char*)_elt) + _dst_hho); \ - _dst_hh->key = _src_hh->key; \ - _dst_hh->keylen = _src_hh->keylen; \ - _dst_hh->hashv = _src_hh->hashv; \ - _dst_hh->prev = _last_elt; \ - _dst_hh->next = NULL; \ - if (_last_elt_hh) { _last_elt_hh->next = _elt; } \ - if (!dst) { \ - DECLTYPE_ASSIGN(dst,_elt); \ - HASH_MAKE_TABLE(hh_dst,dst); \ - } else { \ - _dst_hh->tbl = (dst)->hh_dst.tbl; \ - } \ - HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ - HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt],_dst_hh); \ - (dst)->hh_dst.tbl->num_items++; \ - _last_elt = _elt; \ - _last_elt_hh = _dst_hh; \ - } \ - } \ - } \ - } \ - HASH_FSCK(hh_dst,dst); \ -} while (0) - -#define HASH_CLEAR(hh,head) \ -do { \ - if (head) { \ - uthash_free((head)->hh.tbl->buckets, \ - (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket)); \ - HASH_BLOOM_FREE((head)->hh.tbl); \ - uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ - (head)=NULL; \ - } \ -} while(0) - -#ifdef NO_DECLTYPE -#define HASH_ITER(hh,head,el,tmp) \ -for((el)=(head), (*(char**)(&(tmp)))=(char*)((head)?(head)->hh.next:NULL); \ - el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL)) -#else -#define HASH_ITER(hh,head,el,tmp) \ -for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL); \ - el; (el)=(tmp),(tmp)=DECLTYPE(el)((tmp)?(tmp)->hh.next:NULL)) -#endif - -/* obtain a count of items in the hash */ -#define HASH_COUNT(head) 
HASH_CNT(hh,head) -#define HASH_CNT(hh,head) ((head)?((head)->hh.tbl->num_items):0) - -typedef struct UT_hash_bucket { - struct UT_hash_handle *hh_head; - unsigned count; - - /* expand_mult is normally set to 0. In this situation, the max chain length - * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If - * the bucket's chain exceeds this length, bucket expansion is triggered). - * However, setting expand_mult to a non-zero value delays bucket expansion - * (that would be triggered by additions to this particular bucket) - * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH. - * (The multiplier is simply expand_mult+1). The whole idea of this - * multiplier is to reduce bucket expansions, since they are expensive, in - * situations where we know that a particular bucket tends to be overused. - * It is better to let its chain length grow to a longer yet-still-bounded - * value, than to do an O(n) bucket expansion too often. - */ - unsigned expand_mult; - -} UT_hash_bucket; - -/* random signature used only to find hash tables in external analysis */ -#define HASH_SIGNATURE 0xa0111fe1 -#define HASH_BLOOM_SIGNATURE 0xb12220f2 - -typedef struct UT_hash_table { - UT_hash_bucket *buckets; - unsigned num_buckets, log2_num_buckets; - unsigned num_items; - struct UT_hash_handle *tail; /* tail hh in app order, for fast append */ - ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element */ - - /* in an ideal situation (all buckets used equally), no bucket would have - * more than ceil(#items/#buckets) items. that's the ideal chain length. */ - unsigned ideal_chain_maxlen; - - /* nonideal_items is the number of items in the hash whose chain position - * exceeds the ideal chain maxlen. 
these items pay the penalty for an uneven - * hash distribution; reaching them in a chain traversal takes >ideal steps */ - unsigned nonideal_items; - - /* ineffective expands occur when a bucket doubling was performed, but - * afterward, more than half the items in the hash had nonideal chain - * positions. If this happens on two consecutive expansions we inhibit any - * further expansion, as it's not helping; this happens when the hash - * function isn't a good fit for the key domain. When expansion is inhibited - * the hash will still work, albeit no longer in constant time. */ - unsigned ineff_expands, noexpand; - - uint32_t signature; /* used only to find hash tables in external analysis */ -#ifdef HASH_BLOOM - uint32_t bloom_sig; /* used only to test bloom exists in external analysis */ - uint8_t *bloom_bv; - char bloom_nbits; -#endif - -} UT_hash_table; - -typedef struct UT_hash_handle { - struct UT_hash_table *tbl; - void *prev; /* prev element in app order */ - void *next; /* next element in app order */ - struct UT_hash_handle *hh_prev; /* previous hh in bucket order */ - struct UT_hash_handle *hh_next; /* next hh in bucket order */ - void *key; /* ptr to enclosing struct's key */ - unsigned keylen; /* enclosing struct's key len */ - unsigned hashv; /* result of hash-fcn(key) */ -} UT_hash_handle; - -#endif /* UTHASH_H */ diff --git a/tools/gemBS_plugins/utils.c b/tools/gemBS_plugins/utils.c deleted file mode 100644 index d2c8dc35..00000000 --- a/tools/gemBS_plugins/utils.c +++ /dev/null @@ -1,386 +0,0 @@ -/* -* utils.c -* -* Created on: 15 Sep 2016 -* Author: heath -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "compress.h" -#include "utils.h" - -#define STDIN STDIN_FILENO -#define STDOUT STDOUT_FILENO - -void qstrip(char *s) { - char *p, *p1; - - p = s; - p1 = s - 1; - while (*s) { - if (!isspace((int)*s)) - break; - s++; - } - while (*s) { - if 
(!isspace((int)*s)) - p1 = p; - *(p++) = *(s++); - } - *(++p1) = '\0'; -} - -tokens *tokenize(char *s, const int ch, tokens *tok) { - int n_toks = 0; - char **p = 0, *p1; - - if (!tok) { - tok = malloc(sizeof(tokens)); - if (tok) { - tok->size = 16; - if (tok) { - if (!(tok->toks = malloc(sizeof(void *) * tok->size))) { - free(tok); - tok = NULL; - } - } - } - } - if (tok != NULL) { - p = tok->toks; - if ((p1 = s)) { - if (!ch) { /* Split on white space */ - for (;;) { - while (*s && isspace((int)*s)) - s++; - if (!*s) - break; - if (n_toks == tok->size) { - tok->size <<= 1; - if (!(p = realloc(p, sizeof(void *) * tok->size))) { - free_tokens(tok); - tok = NULL; - break; - } - tok->toks = p; - } - p[n_toks++] = p1; - while (*s && !isspace((int)*s)) { - *p1++ = *s++; - } - if (*s) - s++; - *p1++ = 0; - } - } else { /* Split on token */ - for (;;) { - if (!*s) - break; - if (n_toks == tok->size) { - tok->size <<= 1; - if (!(p = realloc(p, sizeof(void *) * tok->size))) { - free_tokens(tok); - tok = NULL; - break; - } - tok->toks = p; - } - p[n_toks++] = p1; - while (*s && *s != ch) { - *p1++ = *s++; - } - if (*s) - s++; - *p1++ = 0; - qstrip(p[n_toks - 1]); - } - } - } - } - if (tok != NULL) { - if (n_toks == 1 && !*p[0]) - n_toks--; - tok->n_tok = n_toks; - } - return tok; -} - -char *find_prog(const char *prog, const char *path) { - char *p, *p1, *path1, *prog1, name[MAXPATHLEN]; - int sz, sz1, found, i; - struct stat buf; - tokens *tok; - - prog1 = strdup(prog); - found = 0; - tok = tokenize(prog1, ':', 0); - for (i = 0; !found && i < tok->n_tok; i++) { - sz1 = (int)strlen(tok->toks[i]); - if (!(p1 = path1 = strdup(path))) - return 0; - while ((p = strsep(&path1, ":"))) { - if (!*p) { - p = "."; - sz = 1; - } else { - sz = (int)strlen(p); - while (p[sz - 1] == '/') - p[--sz] = 0; - } - assert(sz + sz1 + 1 < MAXPATHLEN); - (void)snprintf(name, MAXPATHLEN, "%s/%s", p, tok->toks[i]); - if (!stat(name, &buf) && S_ISREG(buf.st_mode) && !access(name, X_OK)) { - found = 
1; - break; - } - } - (void)free(p1); - } - free(prog1); - if (tok) - free_tokens(tok); - if (found) { - return strdup(name); - } - return 0; -} - -static void ignore_handler(__attribute__((unused)) int i) { /* Do nothing */ -} - -static int _child_open(const int read_flag, const char *fname, - const char *filterprog, const char *arg) { - int ppipe[2] = {-1, -1}, fd = -1, fd1; - struct stat sbuf; - struct sigaction s_action; - int childpid; - - if (read_flag == READ && fname) - if (stat(fname, &sbuf)) - return fd; - if (pipe(ppipe) < 0) { - (void)fprintf(stderr, "_child_open(): Can't open pipe\n"); - return fd; - } - childpid = fork(); - if (childpid < 0) { - (void)fprintf(stderr, "_child_open(): cannot fork\n"); - return fd; - } - if (childpid > 0) { /* Parent process */ - if (read_flag == READ) { - fd = ppipe[READ]; - if (close(ppipe[WRITE]) < 0) { - (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - } else { - fd = ppipe[WRITE]; - if (close(ppipe[READ]) < 0) { - (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - } - } else { /* Child process */ - errno = 0; - if (read_flag == READ) { - dup2(ppipe[WRITE], STDOUT); - if (close(ppipe[READ]) < 0) { - (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - if (fname) { - fd1 = open(fname, O_RDONLY, 0666); - if (fd1 < 0) { - (void)fprintf(stderr, "_child_open(): cannot open file %s\n", fname); - exit(EXIT_FAILURE); - } - dup2(fd1, STDIN); - } - } else { - dup2(ppipe[READ], STDIN); - if (close(ppipe[WRITE]) < 0) { - (void)fprintf(stderr, "_child_open(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - if (fname) { - fd1 = creat(fname, 0666); - if (fd1 < 0) { - (void)fprintf(stderr, "_child_open(): cannot open file %s\n", fname); - exit(EXIT_FAILURE); - } - dup2(fd1, STDOUT); - } - } - memset(&s_action, 0, sizeof(struct sigaction)); - s_action.sa_handler = ignore_handler; - s_action.sa_flags = 0; - 
(void)sigaction(SIGHUP, &s_action, 0L); - (void)sigaction(SIGINT, &s_action, 0L); - (void)sigaction(SIGQUIT, &s_action, 0L); - (void)sigaction(SIGPIPE, &s_action, 0L); - if (read_flag == READ) - (void)execlp(filterprog, filterprog, arg, (char *)0); - else - (void)execlp(filterprog, filterprog, arg, (char *)0); - (void)fprintf(stderr, "child_open(): cannot exec %s\n", filterprog); - _exit(EXIT_FAILURE); - } - return fd; -} - -int child_open(const int read_flag, const char *fname, const char *filterprog) { - int fd; - - if (read_flag == READ) - fd = _child_open(read_flag, fname, filterprog, "-d"); - else - fd = _child_open(read_flag, fname, filterprog, 0); - return fd; -} - -int child_open_rw(int fd[2], const char *filterprog, char *const argv[]) { - int read_pipe[2] = {-1, -1}, write_pipe[2] = {-1, -1}; - struct sigaction s_action; - int childpid; - - fd[0] = fd[1] = -1; - /* Open up a read pipe (from the filter) and a write pipe (to the filter) */ - if (pipe(read_pipe) < 0 || pipe(write_pipe) < 0) { - (void)fprintf(stderr, "child_open_rw(): Can't open pipe\n"); - return -1; - } - childpid = fork(); - if (childpid < 0) { - (void)fprintf(stderr, "child_open_rw(): cannot fork\n"); - return -1; - } - - if (childpid > 0) { - /* In parent process */ - - /* Close write end of read pipe */ - fd[READ] = read_pipe[READ]; - if (close(read_pipe[WRITE]) < 0) { - (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - /* Close read end of write pipe */ - fd[WRITE] = write_pipe[WRITE]; - if (close(write_pipe[READ]) < 0) { - (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - } else { - /* In child process */ - - /* Duplicate STDOUT to write end of read pipe, and close read end */ - dup2(read_pipe[WRITE], STDOUT); - if (close(read_pipe[READ]) < 0) { - (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - /* Duplicate STDIN to read end of write pipe, and close write end */ 
- dup2(write_pipe[READ], STDIN); - if (close(write_pipe[WRITE]) < 0) { - (void)fprintf(stderr, "child_open_rw(): cannot close pipe\n"); - exit(EXIT_FAILURE); - } - s_action.sa_handler = ignore_handler; - s_action.sa_flags = 0; - (void)sigaction(SIGHUP, &s_action, 0L); - (void)sigaction(SIGINT, &s_action, 0L); - (void)sigaction(SIGQUIT, &s_action, 0L); - (void)sigaction(SIGPIPE, &s_action, 0L); - (void)execv(filterprog, argv); - (void)fprintf(stderr, "child_open_rw(): cannot exec %s\n", filterprog); - _exit(EXIT_FAILURE); - } - return 0; -} - -FILE *_open_readfile(const char *fname, bool *flag, bool chk_flag) { - int guess = COMPRESS_NONE; - FILE *fptr; - unsigned char buf[6]; - char *filter; - char *prog[] = {"gzip", "bzip2", "zip", "compress"}; - - errno = 0; - *flag = false; - if (fname == NULL) - return stdin; - struct compress *compress = get_compress_data(); - if (!(fptr = fopen(fname, "r"))) { - fprintf(stderr, "File Error: Couldn't open '%s' for reading (%s)\n", fname, - strerror(errno)); - if (chk_flag) - exit(-1); - else - return 0; - } - int i = (int)fread(buf, (size_t)1, (size_t)6, fptr); - if (i == 6) { - if (buf[0] == 0x1f) { - if (buf[1] == 0x9d) - guess = COMPRESS_COMPRESS; /* compress */ - else { - if (buf[1] == 0x8b && buf[2] == 0x08) - guess = COMPRESS_GZIP; /* gzip */ - } - } else { - if (buf[0] == 'B' && buf[1] == 'Z' && buf[2] == 'h' && buf[3] >= '0' && - buf[3] <= '9') - guess = COMPRESS_BZIP2; /* bzip2 */ - else { - if (buf[0] == 0xfd && buf[1] == '7' && buf[2] == 'z' && buf[3] == 'X' && buf[4] == 'Z' && buf[5] == 0) - guess = COMPRESS_XZ; /* xz */ - } - } - } - fclose(fptr); - if (guess < COMPRESS_NONE) { - filter = compress->comp_path[guess][0]; - if (filter) { - *flag = true; - i = _child_open(READ, fname, filter, "-d"); - if (!(fptr = fdopen(i, "r"))) { - fputs("Couldn't fdopen() stream", stderr); - exit(-1); - } - if (errno && errno != ESPIPE) { - fputs("Unknown IO error\n", stderr); - exit(-1); - } - errno = 0; - } else { - 
fprintf(stderr, "File '%s' appears to have been " - "compressed using %s, which is not in the " - "current $PATH\n", - fname, prog[guess]); - if (chk_flag) - exit(-1); - fptr = 0; - } - } else { - if (!(fptr = fopen(fname, "r"))) { - fprintf(stderr, "File Error Couldn't open '%s' for reading (%s)\n", - fname, strerror(errno)); - if (chk_flag) - exit(-1); - } - } - return fptr; -} diff --git a/tools/gemBS_plugins/utils.h b/tools/gemBS_plugins/utils.h deleted file mode 100644 index 7e4ae84f..00000000 --- a/tools/gemBS_plugins/utils.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef UTILS_H_ -#define UTILS_H_ - -#include - -#define DEFAULT_PATH "/bin:/usr/bin:/usr/local/bin"; -#define READ 0 -#define WRITE 1 - -#ifndef __unused__ -#if defined(__GNUC__) -# define __unused__ __attribute__((unused)) -#else -# define __unused__ -#endif -#endif - -typedef struct { - char **toks; - int n_tok; - int size; -} tokens; - -void qstrip(char *s); -char *find_prog(const char *prog, const char *path); -tokens *tokenize(char *s, const int ch, tokens *tok); -FILE *_open_readfile(const char *fname, bool *flag, bool chk_flag); -int child_open_rw(int fd[2],const char *filterprog,char *const argv[]); -int child_open(const int read_flag,const char *fname,const char *filterprog); - -#define free_tokens(x) \ - { \ - free((x)->toks); \ - free(x); \ - } -#define open_readfile_and_check(a, b) _open_readfile((a), (b), true) -#define open_readfile(a, b) _open_readfile((a), (b), false) - -#endif /* UTILS_H */ diff --git a/tools/utils/readNameClean/readNameClean.c b/tools/utils/readNameClean/readNameClean.c new file mode 100644 index 00000000..29d0817f --- /dev/null +++ b/tools/utils/readNameClean/readNameClean.c @@ -0,0 +1,139 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#include "utils.h" +#include "uthash.h" + +// Strip illegal characters from Read IDs in SAM file +// Valid characters are [!-?A-~] + +// Option to edit SAM headers, adding extra information to the @SQ lines 
+ +#define NUM_SQTAGS 4 +static char *sqtags[NUM_SQTAGS] = { + "LN", "M5", "AS", "SP" +}; + +typedef struct { + char *name; + char *tags[NUM_SQTAGS]; + UT_hash_handle hh; +} ctg_t; + +ctg_t *process_ctg_file(char *name) { + ctg_t *ctgs = NULL; + bool flag; + FILE *fp = open_readfile(name, &flag); + if(fp == NULL) { + fprintf(stderr, "Could not open %s for reading\n", name); + exit(-1); + } + char *buf = NULL; + size_t buf_size = 0, tlen = 0; + ssize_t l; + tokens *tok = NULL; + while(1) { + l = getline(&buf, &buf_size, fp); + if(l < 0) break; + tok = tokenize(buf, '\t', tok); + if(tok->n_tok > 1) { + ctg_t *ct = NULL; + HASH_FIND_STR(ctgs, tok->toks[0], ct); + if(ct != NULL) { + fprintf(stderr, "process_ctg_file(): error - duplicate contig %s\n", tok->toks[0]); + exit(-1); + } + ct = malloc(sizeof(ctg_t)); + ct->name = strdup(tok->toks[0]); + for(int i = 0; i < NUM_SQTAGS; i++) ct->tags[i] = NULL; + for(int i = 1; i < tok->n_tok; i++) { + const char * const p = tok->toks[i]; + for(int j = 0; j < NUM_SQTAGS; j++) { + if(!strncmp(p, sqtags[j], 2) && p[2] == ':') { + ct->tags[j] = strdup(p + 3); + break; + } + } + } + HASH_ADD_KEYPTR(hh, ctgs, ct->name, strlen(ct->name), ct); + } + } + fclose(fp); + if(flag) while(waitpid(-1, NULL, 0) > 0); + if(tok != NULL) free_tokens(tok); + if(buf != NULL) free(buf); + return ctgs; +} + +int main(int argc, char *argv[]) { + FILE *fp = stdin; + char *buf = NULL; + size_t buf_size = 0; + ssize_t l; + ctg_t *ctgs = NULL; + + if(argc > 1) ctgs = process_ctg_file(argv[1]); + // Process header lines - no conversion + while(1) { + l = getline(&buf, &buf_size, fp); + if(l < 0) return 0; + if(buf[0] != '@') break; + bool pflag = true; + if(l > 8 && !strncmp(buf + 1, "SQ\tSN:", 6)) { + char *p = buf + 7; + char *p1 = p; + while(*p1 && *p1 != '\t' && *p1 != '\n') p1++; + size_t l = p1 - p; + ctg_t *ct = NULL; + HASH_FIND(hh, ctgs, p, l, ct); + if(ct) { + pflag = false; + int mask = 0; + char c = *p1; + *p1 = 0; + fputs(buf, stdout); + 
while(c == '\t') { + p1++; + p = p1; + for(int j = 0; j < NUM_SQTAGS; j++) { + if(!strncmp(p1, sqtags[j], 2) && p1[2] == ':') { + mask |= (1 << j); + break; + } + } + while(*p1 && *p1 != '\t' && *p1 != '\n') p1++; + c = *p1; + *p1 = 0; + printf("\t%s", p); + } + int j = 0; + for(int j = 0; j < NUM_SQTAGS; j++) { + if(ct->tags[j] != NULL && !(mask & (1 << j))) printf("\t%s:%s", sqtags[j], ct->tags[j]); + } + fputc('\n', stdout); + } + } + if(pflag) fputs(buf, stdout); + } + // Process the rest of the file + while(l >= 0) { + int i; + bool found = false; + for(i = 0; i < l && buf[i] != '\t'; i++) if((found = (buf[i] == '@' || buf[i] < '!' || buf[i] > '~'))) break; + if(found) { + int j = i; + for(i = i + 1; i < l && buf[i] != '\t'; i++) { + if(buf[i] != '@' && buf[i] >= '!' && buf[i] <= '~') buf[j++] = buf[i]; + } + for(; i <= l; i++) buf[j++] = buf[i]; + } + fputs(buf, stdout); + l = getline(&buf, &buf_size, fp); + } + if(buf) free(buf); + return 0; +} From ad64659a0e506c27d198e8bf6221607d2e98609e Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sun, 26 Jan 2020 11:07:55 +0100 Subject: [PATCH 31/61] Switch to use new snpxtr utility --- gemBS/__init__.py | 21 ++++++--------------- gemBS/commands.py | 3 --- gemBS/production.py | 2 +- setup.py | 11 ++--------- 4 files changed, 9 insertions(+), 28 deletions(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 14aa1bec..1fd43ad3 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -1227,14 +1227,17 @@ def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=Fal process = run_tools([mextr], name="Methylation Extraction", logfile=logfile) if snps: - bcftools = [executables['bcftools'],'view','-R',contig_bed,'-Ou',bcfFile] - snpxtr = [executables['bcftools'],'+snpxtr','--','-z','-o',outbase + '_snps.txt.gz'] + snpxtr = [executables['snpxtr'],'-zmx','-o',outbase + '_snps.txt.gz'] if snp_list: snpxtr.extend(['-s',snp_list]) if snp_db: snpxtr.extend(['-D',snp_db]) + if extract_threads: + 
snpxtr.extend(['-@', extract_threads]) + + snpxtr.append(bcfFile); snp_logfile = os.path.join(output_dir,"snpxtr_{}.err".format(name)) - process_snp = run_tools([bcftools, snpxtr], name="SNP Extraction",logfile=snp_logfile) + process_snp = run_tools([snpxtr], name="SNP Extraction",logfile=snp_logfile) if process_snp.wait() != 0: raise ValueError("Error while extracting SNP calls.") @@ -1244,18 +1247,6 @@ def methylationFiltering(bcfFile=None,outbase=None,name=None,strand_specific=Fal os.remove(contig_bed) - if snps: - tfile = "{}_snp.txt.gz.tbi".format(outbase) - if os.path.exists(tfile): - os.remove(tfile) - logfile = os.path.join(output_dir,"tabix_{}_snps.err".format(name)) - tabix = [executables['tabix'], '-S', '1', '-s' '1', '-b', '2', '-e', '2', "{}_snps.txt.gz".format(outbase)] - snp_idx_proc = run_tools([tabix],name="Index SNP files", logfile=logfile) - - if snps: - if snp_idx_proc.wait() != 0: - raise ValueError("Error while indexing SNP calls.") - return os.path.abspath(output_dir) def bsConcat(list_bcfs=None,sample=None,threads=None,bcfSample=None,benchmark_mode=False): diff --git a/gemBS/commands.py b/gemBS/commands.py index ef1cbafe..3db936dc 100644 --- a/gemBS/commands.py +++ b/gemBS/commands.py @@ -97,9 +97,6 @@ def gemBS_main(): parser.add_argument('-j', '--json-file', dest="json", help="Location of gemBS JSON file") parser.add_argument('-d', '--dir', dest="wd", metavar="DIR",help="Set working directory") - if pkg_resources.resource_exists("gemBS", "libexec/bcftools"): - f = pkg_resources.resource_filename("gemBS", "libexec/bcftools") - os.environ["BCFTOOLS_PLUGINS"] = f if pkg_resources.resource_exists("gemBS", "bin"): f = pkg_resources.resource_filename("gemBS", "bin") path = os.environ.get("PATH") diff --git a/gemBS/production.py b/gemBS/production.py index cbce5e62..8b8e3465 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -1527,7 +1527,7 @@ def do_filter(self, v): if self.snps and not(sm & 768): snps = True - files.extend([filebase 
+ '_snps.txt.gz', filebase + '_snps.txt.gz_tbi']) + files.extend([filebase + '_snps.txt.gz', filebase + '_snps.txt.gz_tbi', filebase + '_snps.txt.gz.md5']) if self.dry_run or self.dry_run_json: args = self.args diff --git a/setup.py b/setup.py index f1f65e02..b095d25d 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,7 @@ def _install_bundle(install_dir, inst): os.mkdir(gemBSbin_dir) # copy tools/bin - bins = ['gemBS_cat', 'readNameClean', 'md5_fasta', 'mextr'] + bins = ['gemBS_cat', 'readNameClean', 'md5_fasta', 'mextr', 'snpxtr'] for file in bins: f = os.path.join('tools/bin', file) if os.path.exists(f): @@ -101,10 +101,9 @@ def _install_bundle(install_dir, inst): # copy samtools, bcftools and config files bin_dir = os.path.join(install_dir, "bin") lib_dir = os.path.join(install_dir, "lib") - plugins_dir = os.path.join(install_dir, "libexec", "bcftools") etc_dir = os.path.join(install_dir, "etc") config_dir = os.path.join(etc_dir, "gemBS_configs") - for dir in [bin_dir, lib_dir, plugins_dir, config_dir]: + for dir in [bin_dir, lib_dir, config_dir]: if not os.path.exists(dir): os.makedirs(dir) if not (inst.minimal or inst.no_samtools): @@ -129,12 +128,6 @@ def _install_bundle(install_dir, inst): # print ("Copy binary: bcftools to {}".format(bin_dir)) shutil.copy("tools/bcftools/bcftools", bin_dir) os.chmod(os.path.join(bin_dir, "bcftools"), 0o755) - plugins = [x for x in glob.glob("tools/bcftools/plugins/*.so")] - for file in plugins: - # print ("Copy plugin: {} to {}".format(file, plugins_dir)) - shutil.copy(file, plugins_dir) - os.chmod(os.path.join(plugins_dir,os.path.basename(file)), 0o755) - files = [x for x in os.listdir("gemBS/etc")] for file in files: if os.path.isfile(os.path.join("gemBS/etc",file)): From 6e4dda8ac2cf22645c213d084bb8ea19f6a8398f Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Mon, 27 Jan 2020 18:01:16 +0100 Subject: [PATCH 32/61] Remove spurious space after end feature coordinate (col 3) in strand specific CpG output file --- 
tools/utils/mextr/output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/utils/mextr/output.c b/tools/utils/mextr/output.c index 66b9fc10..260457d7 100644 --- a/tools/utils/mextr/output.c +++ b/tools/utils/mextr/output.c @@ -106,7 +106,7 @@ void output_cpg(args_t *const args, rec_t ** const lrec, const int idx) { for(int pos = 0; pos < 2; pos++) { rec_t *rec = lrec[idx ^ pos]; int *mq_p = rec->tags[FMT_MQ].ne == ns ? rec->tags[FMT_MQ].dat_p : NULL; - ksprintf(ks_clear(s), "%s\t%" PRId64 "\t%" PRId64 " \t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, rec->ref); + ksprintf(ks_clear(s), "%s\t%" PRId64 "\t%" PRId64 "\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, rec->ref); char *cx_p = rec->tags[FMT_CX].dat_p; for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { gt_meth *g = rec->sample_gt+ix; From 7256070f8ed0b01dfd53601f40fa05d7497df542 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Mon, 27 Jan 2020 18:04:40 +0100 Subject: [PATCH 33/61] Remove spurious space in strand specific CpG output file (backport from devel) --- tools/gemBS_plugins/output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/gemBS_plugins/output.c b/tools/gemBS_plugins/output.c index e8548d5a..868eaed8 100644 --- a/tools/gemBS_plugins/output.c +++ b/tools/gemBS_plugins/output.c @@ -130,7 +130,7 @@ void output_cpg(args_t *args, bcf1_t *rec, fmt_field_t *tags, gt_meth *sample_gt } else { for(int pos = 0; pos < 2; pos++) { int *mq_p = tags[FMT_MQ].st[idx ^ pos].ne == ns ? tags[FMT_MQ].st[idx ^ pos].dat_p : NULL; - fprintf(fp,"%s\t%" PRId64 "\t%" PRId64 " \t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, cx_len >= 3 + pos ? cx[2 + pos] : '.'); + fprintf(fp,"%s\t%" PRId64 "\t%" PRId64 "\t%c", args->hdr->id[BCF_DT_CTG][rec->rid].key, rec->pos + pos, rec->pos + pos + 1, cx_len >= 3 + pos ? 
cx[2 + pos] : '.'); char *cx_p = tags[FMT_CX].st[idx].dat_p; for(int ix = 0; ix < ns; ix++, cx_p += cx_sz) { gt_meth *g = sample_gt[idx ^ pos]+ix; From 9ed9206325d64898fd3e20770ff44c24d4a47015 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 29 Jan 2020 18:04:50 +0100 Subject: [PATCH 34/61] Pull in bug fix from bs_call and bump version --- README.md | 1 + gemBS/version.py | 2 +- tools/bs_call | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b364e6b4..98b6f3e5 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ Documentation can be found at ---------- Changelog: ---------- + 3.4.5 Fix crash when using the -k (keep-mismatch) flag, and fix rare hangs at end of processing 3.4.4 Sort input bcf files to bcftools concat stage to ensure reproducibility. 3.4.4 Add extra sort keys when generating pools to ensure stability of pool membership in the event of multiple contigs having the same size diff --git a/gemBS/version.py b/gemBS/version.py index 306c9ba5..728614e9 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" __VERSION_MINOR = "4" -__VERSION_SUBMINOR = "4" +__VERSION_SUBMINOR = "5" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) diff --git a/tools/bs_call b/tools/bs_call index c6fd9d3d..273e6849 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit c6fd9d3dd75cff4def78a88db96c3e66e0b94934 +Subproject commit 273e684950e343c5e33bb85ced4c609c10e22ec3 From 41bc143b59d6d9ebf451f0abef39b594f908f8e1 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 29 Jan 2020 18:24:26 +0100 Subject: [PATCH 35/61] New version of bs_call --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index c6fd9d3d..d2be349e 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit c6fd9d3dd75cff4def78a88db96c3e66e0b94934 +Subproject commit 
d2be349eae8a3ce58d7d11b857cba9e6716ba208 From 398e2e5d38c339a96e2ce8e47fa357946ef8ad9d Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Thu, 30 Jan 2020 06:52:25 +0100 Subject: [PATCH 36/61] Add IHEC tagged Singularity recipe --- IHEC/Singularity.ihec | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 IHEC/Singularity.ihec diff --git a/IHEC/Singularity.ihec b/IHEC/Singularity.ihec new file mode 100644 index 00000000..0ab67a46 --- /dev/null +++ b/IHEC/Singularity.ihec @@ -0,0 +1,21 @@ +BootStrap: docker +From: ubuntu:xenial + +%runscript + exec /usr/local/bin/gemBS $@ + +%help + gemBS singularity container + +%post + (mkdir /ext && cd /ext && mkdir disk1 disk2 disk3 disk4 disk5 disk6 disk7 disk8 disk9) + apt-get update + apt-get install -y libpng-dev uuid-dev libmysqlclient-dev + apt-get install -y python3 build-essential git python3-pip wget pigz + apt-get install -y zlib1g-dev libbz2-dev gsl-bin libgsl0-dev + apt-get install -y libncurses5-dev liblzma-dev libssl-dev libcurl4-openssl-dev + pip3 install 'matplotlib<3.0' multiprocess + mkdir /usr/local/build; cd /usr/local/build + git clone --recursive https://github.com/heathsc/gemBS.git + (cd gemBS; python3 setup.py install) + rm -rf gemBS && cd && rmdir /usr/local/build From d828a635a07feb42e698073d5ba6cd25e7329df0 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sun, 9 Feb 2020 07:57:21 +0100 Subject: [PATCH 37/61] General fixes to use new snp extractor --- gemBS/__init__.py | 31 +- gemBS/production.py | 4 +- tools/bs_call | 2 +- tools/utils/common/dbSNP.c | 56 +- tools/utils/common/uthash.h | 2 +- tools/utils/mextr/bbi.c | 5 +- tools/utils/mextr/command_line.c | 2 +- tools/utils/resources/uthash.h | 1150 ++++++++++++++++++++++++++++++ tools/utils/snpxtr/snpxtr.c | 13 +- 9 files changed, 1217 insertions(+), 48 deletions(-) create mode 100644 tools/utils/resources/uthash.h diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 1fd43ad3..3d1ee24a 100644 --- a/gemBS/__init__.py +++ 
b/gemBS/__init__.py @@ -673,8 +673,7 @@ def dbSNP_index(list_dbSNP_files=[],dbsnp_index=""): return os.path.abspath(dbsnp_index) -def makeChromSizes(index_name=None,output=None): - +def makeChromSizes(index_name=None,output=None, omit=[]): index_base = index_name[:-4] if index_name.endswith('.gem') else index_name print(index_name, index_base) @@ -693,8 +692,16 @@ def makeChromSizes(index_name=None,output=None): if new_sz > sz: chrom_sizes[chr] = new_sz else: chrom_sizes[chr] = new_sz + for pattern in omit: + if pattern == "": continue + r = re.compile(fnmatch.translate(pattern)) + for c in list(chrom_sizes.keys()): + if r.search(c): + del chrom_sizes[c] + with open(output, "w") as f: for chr, size in [(c, chrom_sizes[c]) for c in sorted(chrom_sizes, key=chrom_sizes.get, reverse=True)]: + f.write("{}\t{}\n".format(chr,size)) return os.path.abspath(output) else: @@ -855,7 +862,7 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benc class BsCaller: def __init__(self,reference,species,right_trim=0,left_trim=5,keep_unmatched=False, - keep_duplicates=False,ignore_duplicates=False,contig_size=None,dbSNP_index_file="", + keep_duplicates=False,ignore_duplicates=False,contig_size=None,csizes=None,dbSNP_index_file="", call_threads="1",merge_threads="1",mapq_threshold=None,bq_threshold=None, haploid=False,conversion=None,ref_bias=None,sample_conversion=None,benchmark_mode=False): self.reference = reference @@ -875,6 +882,7 @@ def __init__(self,reference,species,right_trim=0,left_trim=5,keep_unmatched=Fals self.ref_bias = ref_bias self.sample_conversion = sample_conversion self.contig_size = contig_size + self.csizes = csizes self.benchmark_mode = benchmark_mode def prepare(self, sample, input_bam, chrom_list, output_bcf, report_file, contig_bed): @@ -883,10 +891,10 @@ def prepare(self, sample, input_bam, chrom_list, output_bcf, report_file, contig for chrom in chrom_list: f.write("{}\t0\t{}\n".format(chrom, str(self.contig_size[chrom]))) - 
parameters_bscall = ['%s' %(executables["bs_call"]),'-r',self.reference,'-n',sample,'--contig-bed',contig_bed,'--report-file',report_file] + parameters_bscall = ['%s' %(executables["bs_call"]),'-r',self.reference,'-n',sample,'--contig-bed',contig_bed,'--contig-sizes',self.csizes,'--report-file',report_file] parameters_bscall.extend(['--right-trim', str(self.right_trim), '--left-trim', str(self.left_trim)]) - + if self.keep_unmatched: parameters_bscall.append('-k') if self.keep_duplicates: @@ -1151,15 +1159,16 @@ def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=No ret = c.fetchone() if not ret or ret[2] != 1: raise CommandException("Could not open contig sizes file.") + csizes = ret[0] contig_size = {} - with open (ret[0], "r") as f: + with open (csizes, "r") as f: for line in f: fd = line.split() if(len(fd) > 1): contig_size[fd[0]] = int(fd[1]) bsCall = BsCaller(reference=reference,species=species,right_trim=right_trim,left_trim=left_trim, - keep_unmatched=keep_unmatched,keep_duplicates=keep_duplicates,ignore_duplicates=ignore_duplicates,contig_size=contig_size, + keep_unmatched=keep_unmatched,keep_duplicates=keep_duplicates,ignore_duplicates=ignore_duplicates,contig_size=contig_size,csizes=csizes, dbSNP_index_file=dbSNP_index_file,call_threads=call_threads,merge_threads=merge_threads,mapq_threshold=mapq_threshold,bq_threshold=bq_threshold, haploid=haploid,conversion=conversion,ref_bias=ref_bias,sample_conversion=sample_conversion,benchmark_mode=benchmark_mode) @@ -1267,7 +1276,7 @@ def bsConcat(list_bcfs=None,sample=None,threads=None,bcfSample=None,benchmark_mo logfile = os.path.join(output_dir,"bcf_concat_{}.err".format(sample)) #Concatenation - concat = [executables['bcftools'],'concat','-O','b','-o',bcfSample] + concat = [executables['bcftools'],'concat','-O','b','-n','-o',bcfSample] if threads != None: concat.extend(['--threads', threads]) if benchmark_mode: @@ -1280,7 +1289,11 @@ def 
bsConcat(list_bcfs=None,sample=None,threads=None,bcfSample=None,benchmark_mo raise ValueError("Error while concatenating bcf calls.") #Indexing - indexing = [executables['bcftools'],'index',bcfSample] + indexing = [executables['bcftools'],'index'] + if threads != None: + indexing.extend(['--threads', threads]) + indexing.append(bcfSample) + #md5sum md5sum = ['md5sum',bcfSample] diff --git a/gemBS/production.py b/gemBS/production.py index 8b8e3465..925ca0ca 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -230,7 +230,9 @@ def run(self, args): if csizes_ok == 1: logging.warning("Contig sizes file {} already exists, skipping indexing".format(csizes)) else: - ret = makeChromSizes(index_name, csizes) + config = jsonData.config + omit = config['calling'].get('omit_contigs', []) + ret = makeChromSizes(index_name, csizes, omit) if ret: logging.gemBS.gt("Contig sizes file done: {}".format(ret)) db.check() diff --git a/tools/bs_call b/tools/bs_call index d2be349e..b6cf5fb4 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit d2be349eae8a3ce58d7d11b857cba9e6716ba208 +Subproject commit b6cf5fb46faa3baf24501a457500864597dd8098 diff --git a/tools/utils/common/dbSNP.c b/tools/utils/common/dbSNP.c index 7557fd36..6f70a208 100644 --- a/tools/utils/common/dbSNP.c +++ b/tools/utils/common/dbSNP.c @@ -55,10 +55,17 @@ dbsnp_header_t *load_dbSNP_header(char * const fname) { hdr->dbSNP_bufsize = td1[1]; ucomp_buf = malloc(td1[1]); comp_buf = malloc(td1[2]); - sz = fread(comp_buf, 1, td1[2], file); - unsigned long size = td1[1]; - if(sz != td1[2]) ok = false; + if(fseek(file, td1[0], SEEK_SET)) ok = false; else { + sz = fread(comp_buf, 1, td1[2], file); + if(sz != td1[2]) ok = false; + else { + sz = fread(td, sizeof(uint32_t), 1, file); + if(sz != 1 || *td != 0xd7278434) ok = false; + } + } + unsigned long size = td1[1]; + if(ok) { int ret = uncompress(ucomp_buf, &size, comp_buf, td1[2]); if(ret) ok = false; } @@ -86,15 +93,17 @@ dbsnp_header_t 
*load_dbSNP_header(char * const fname) { } } uint32_t min_bin = 0, max_bin = 0; + uint64_t offset = 0; for(int i = 0; ok && i < n_ctgs && p < p1; i++) { - if(p + 8 >= p1) ok = false; + if(p + 16 >= p1) ok = false; else { memcpy(&min_bin, p, sizeof(uint32_t)); memcpy(&max_bin, p + 4, sizeof(uint32_t)); + memcpy(&offset, p + 8, sizeof(uint64_t)); if(max_bin < min_bin) { ok = false; } - else p += 8; + else p += 16; } if(!ok) break; l = strlen(p); @@ -102,6 +111,7 @@ dbsnp_header_t *load_dbSNP_header(char * const fname) { else { ctgs[i].min_bin = min_bin; ctgs[i].max_bin = max_bin; + ctgs[i].file_offset = offset; ctgs[i].name = malloc(l + 1); memcpy(ctgs[i].name, p, l + 1); p += l + 1; @@ -109,26 +119,17 @@ dbsnp_header_t *load_dbSNP_header(char * const fname) { } } } - if(!ok) { - free(hdr); - fclose(file); - return NULL; - } - fseek(file, td1[0], SEEK_SET); - for(int i = 0; i < n_ctgs; i++) { - size_t k = fread(&ctgs[i].file_offset, sizeof(uint64_t), 1, file); - if(k != 1) { - ok = false; - break; - } - dbsnp_ctg_t *ctg; - HASH_FIND(hh, hdr->dbSNP, ctgs[i].name, strlen(ctgs[i].name), ctg); - if(ctg != NULL) { - fprintf(stderr,"Error in dbSNP file - duplicate contigs (%s)\n", ctgs[i].name); - ok = false; - break; + if(ok) { + for(int i = 0; i < n_ctgs; i++) { + dbsnp_ctg_t *ctg; + HASH_FIND(hh, hdr->dbSNP, ctgs[i].name, strlen(ctgs[i].name), ctg); + if(ctg != NULL) { + fprintf(stderr,"Error in dbSNP file - duplicate contigs (%s)\n", ctgs[i].name); + ok = false; + break; + } + HASH_ADD_KEYPTR(hh, hdr->dbSNP, ctgs[i].name, strlen(ctgs[i].name), ctgs + i); } - HASH_ADD_KEYPTR(hh, hdr->dbSNP, ctgs[i].name, strlen(ctgs[i].name), ctgs + i); } if(comp_buf) free(comp_buf); if(ucomp_buf) free(ucomp_buf); @@ -184,7 +185,7 @@ bool load_dbSNP_ctg(const dbsnp_header_t * const hdr, dbsnp_ctg_t * const ctg) { void * const ucomp_buf = malloc(hdr->dbSNP_bufsize); size_t comp_buf_size = 1 + hdr->dbSNP_bufsize * .75; void *comp_buf = malloc(comp_buf_size); - fseek(file, 
ctg->file_offset, SEEK_SET); + if(fseek(file, ctg->file_offset, SEEK_SET)) ok = false; uint16_t *entries = malloc(sizeof(uint16_t) * 64); uint8_t *name_buf = malloc(sizeof(uint8_t) * 256 * 64); int n_snps = 0, n_bins = 0; @@ -250,10 +251,7 @@ bool load_dbSNP_ctg(const dbsnp_header_t * const hdr, dbsnp_ctg_t * const ctg) { } if(!ok) break; curr_bin += bin_inc; - if(curr_bin > ctg->max_bin || bp >= bp_end) { - ok = false; - break; - } + if(curr_bin > ctg->max_bin || bp >= bp_end) break; bins += bin_inc; } uint8_t x = *bp++; diff --git a/tools/utils/common/uthash.h b/tools/utils/common/uthash.h index b017556b..5b3cd175 120000 --- a/tools/utils/common/uthash.h +++ b/tools/utils/common/uthash.h @@ -1 +1 @@ -../../bs_call/include/uthash.h \ No newline at end of file +../resources/uthash.h \ No newline at end of file diff --git a/tools/utils/mextr/bbi.c b/tools/utils/mextr/bbi.c index c702d142..1aa0eecf 100644 --- a/tools/utils/mextr/bbi.c +++ b/tools/utils/mextr/bbi.c @@ -447,8 +447,8 @@ void *bbi_compress_thread(void *p) { size_t comp_buf_size = 0; cblock_buffer_t * const cbuf = &args->cblock_buf; const int nb = cbuf->n_cblocks; + pthread_mutex_lock(&cbuf->mut); for(;;) { - pthread_mutex_lock(&cbuf->mut); for(;;) { for(int i = 0; i < nb; i++, idx = (idx + 1) % nb) if(cbuf->cblocks[idx].state == cblock_uncompressed) break; if(cbuf->cblocks[idx].state == cblock_uncompressed || cbuf->end_of_input) break; @@ -475,6 +475,7 @@ void *bbi_compress_thread(void *p) { ks_resize(buf, (size_t)compress_size); memcpy(buf->s, comp_buf, (size_t)compress_size); buf->l = (size_t)compress_size; + pthread_mutex_lock(&cbuf->mut); cb->state = cblock_compressed; pthread_cond_signal(&cbuf->cond[2]); idx = (idx + 1) % nb; @@ -533,8 +534,8 @@ void finish_bb_block(args_t * const args, const int ctg_id, const int ix) { bp->ctg_id = ctg_id; bp->block_idx = bdata->block_idx - 1; bp->state = cblock_uncompressed; - pthread_mutex_unlock(&cbuf->mut); pthread_cond_signal(&cbuf->cond[1]); + 
pthread_mutex_unlock(&cbuf->mut); cbuf->pos = (pos + 1) % cbuf->n_cblocks; if(gdata->first_time) { gdata->first_ctg = ctg_id; diff --git a/tools/utils/mextr/command_line.c b/tools/utils/mextr/command_line.c index ee435376..2b0ce0dc 100644 --- a/tools/utils/mextr/command_line.c +++ b/tools/utils/mextr/command_line.c @@ -164,7 +164,7 @@ void handle_command_line(int argc, char *argv[], args_t * const args) { int c; bool regions_file = false; char *regions_list = NULL; - while ((c = getopt_long(argc, argv, "?QDxh:o:c:b:n:r:s:w:@:m:R:M:I:S:p:B:N:T:t:gzHah?",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "Q:Dxo:c:b:n:r:s:w:@:m:R:M:I:S:p:B:N:T:t:gzHah?",loptions,NULL)) >= 0) { switch (c) { case 'o': args->cpgfilename = optarg; diff --git a/tools/utils/resources/uthash.h b/tools/utils/resources/uthash.h new file mode 100644 index 00000000..5e5866a3 --- /dev/null +++ b/tools/utils/resources/uthash.h @@ -0,0 +1,1150 @@ +/* +Copyright (c) 2003-2018, Troy D. Hanson http://troydhanson.github.com/uthash/ +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef UTHASH_H +#define UTHASH_H + +#define UTHASH_VERSION 2.1.0 + +#include /* memcmp, memset, strlen */ +#include /* ptrdiff_t */ +#include /* exit */ + +/* These macros use decltype or the earlier __typeof GNU extension. + As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + when compiling c++ source) this code uses whatever method is needed + or, for VS2008 where neither is available, uses casting workarounds. */ +#if !defined(DECLTYPE) && !defined(NO_DECLTYPE) +#if defined(_MSC_VER) /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define DECLTYPE(x) (decltype(x)) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#endif +#elif defined(__BORLANDC__) || defined(__ICCARM__) || defined(__LCC__) || defined(__WATCOMC__) +#define NO_DECLTYPE +#else /* GNU, Sun and other compilers */ +#define DECLTYPE(x) (__typeof(x)) +#endif +#endif + +#ifdef NO_DECLTYPE +#define DECLTYPE(x) +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + char **_da_dst = (char**)(&(dst)); \ + *_da_dst = (char*)(src); \ +} while (0) +#else +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + (dst) = DECLTYPE(dst)(src); \ +} while (0) +#endif + +/* a number of the hash function use uint32_t which isn't defined on Pre VS2010 */ +#if defined(_WIN32) +#if defined(_MSC_VER) && _MSC_VER >= 1600 +#include +#elif defined(__WATCOMC__) || defined(__MINGW32__) || defined(__CYGWIN__) +#include +#else +typedef 
unsigned int uint32_t; +typedef unsigned char uint8_t; +#endif +#elif defined(__GNUC__) && !defined(__VXWORKS__) +#include +#else +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; +#endif + +#ifndef uthash_malloc +#define uthash_malloc(sz) malloc(sz) /* malloc fcn */ +#endif +#ifndef uthash_free +#define uthash_free(ptr,sz) free(ptr) /* free fcn */ +#endif +#ifndef uthash_bzero +#define uthash_bzero(a,n) memset(a,'\0',n) +#endif +#ifndef uthash_strlen +#define uthash_strlen(s) strlen(s) +#endif + +#ifdef uthash_memcmp +/* This warning will not catch programs that define uthash_memcmp AFTER including uthash.h. */ +#warning "uthash_memcmp is deprecated; please use HASH_KEYCMP instead" +#else +#define uthash_memcmp(a,b,n) memcmp(a,b,n) +#endif + +#ifndef HASH_KEYCMP +#define HASH_KEYCMP(a,b,n) uthash_memcmp(a,b,n) +#endif + +#ifndef uthash_noexpand_fyi +#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ +#endif +#ifndef uthash_expand_fyi +#define uthash_expand_fyi(tbl) /* can be defined to log expands */ +#endif + +#ifndef HASH_NONFATAL_OOM +#define HASH_NONFATAL_OOM 0 +#endif + +#if HASH_NONFATAL_OOM +/* malloc failures can be recovered from */ + +#ifndef uthash_nonfatal_oom +#define uthash_nonfatal_oom(obj) do {} while (0) /* non-fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) do { (oomed) = 1; } while (0) +#define IF_HASH_NONFATAL_OOM(x) x + +#else +/* malloc failures result in lost memory, hash tables are unusable */ + +#ifndef uthash_fatal +#define uthash_fatal(msg) exit(-1) /* fatal OOM error */ +#endif + +#define HASH_RECORD_OOM(oomed) uthash_fatal("out of memory") +#define IF_HASH_NONFATAL_OOM(x) + +#endif + +/* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS 32U /* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS_LOG2 5U /* lg2 of initial number of buckets */ +#define HASH_BKT_CAPACITY_THRESH 10U /* expand when bucket count reaches */ + +/* calculate the element whose hash handle 
address is hhp */ +#define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho))) +/* calculate the hash handle from element address elp */ +#define HH_FROM_ELMT(tbl,elp) ((UT_hash_handle*)(void*)(((char*)(elp)) + ((tbl)->hho))) + +#define HASH_ROLLBACK_BKT(hh, head, itemptrhh) \ +do { \ + struct UT_hash_handle *_hd_hh_item = (itemptrhh); \ + unsigned _hd_bkt; \ + HASH_TO_BKT(_hd_hh_item->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + (head)->hh.tbl->buckets[_hd_bkt].count++; \ + _hd_hh_item->hh_next = NULL; \ + _hd_hh_item->hh_prev = NULL; \ +} while (0) + +#define HASH_VALUE(keyptr,keylen,hashv) \ +do { \ + HASH_FCN(keyptr, keylen, hashv); \ +} while (0) + +#define HASH_FIND_BYHASHVALUE(hh,head,keyptr,keylen,hashval,out) \ +do { \ + (out) = NULL; \ + if (head) { \ + unsigned _hf_bkt; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _hf_bkt); \ + if (HASH_BLOOM_TEST((head)->hh.tbl, hashval) != 0) { \ + HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ], keyptr, keylen, hashval, out); \ + } \ + } \ +} while (0) + +#define HASH_FIND(hh,head,keyptr,keylen,out) \ +do { \ + (out) = NULL; \ + if (head) { \ + unsigned _hf_hashv; \ + HASH_VALUE(keyptr, keylen, _hf_hashv); \ + HASH_FIND_BYHASHVALUE(hh, head, keyptr, keylen, _hf_hashv, out); \ + } \ +} while (0) + +#ifdef HASH_BLOOM +#define HASH_BLOOM_BITLEN (1UL << HASH_BLOOM) +#define HASH_BLOOM_BYTELEN (HASH_BLOOM_BITLEN/8UL) + (((HASH_BLOOM_BITLEN%8UL)!=0UL) ? 
1UL : 0UL) +#define HASH_BLOOM_MAKE(tbl,oomed) \ +do { \ + (tbl)->bloom_nbits = HASH_BLOOM; \ + (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN); \ + if (!(tbl)->bloom_bv) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ + (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ + } \ +} while (0) + +#define HASH_BLOOM_FREE(tbl) \ +do { \ + uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ +} while (0) + +#define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8U] |= (1U << ((idx)%8U))) +#define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8U] & (1U << ((idx)%8U))) + +#define HASH_BLOOM_ADD(tbl,hashv) \ + HASH_BLOOM_BITSET((tbl)->bloom_bv, ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#define HASH_BLOOM_TEST(tbl,hashv) \ + HASH_BLOOM_BITTEST((tbl)->bloom_bv, ((hashv) & (uint32_t)((1UL << (tbl)->bloom_nbits) - 1U))) + +#else +#define HASH_BLOOM_MAKE(tbl,oomed) +#define HASH_BLOOM_FREE(tbl) +#define HASH_BLOOM_ADD(tbl,hashv) +#define HASH_BLOOM_TEST(tbl,hashv) (1) +#define HASH_BLOOM_BYTELEN 0U +#endif + +#define HASH_MAKE_TABLE(hh,head,oomed) \ +do { \ + (head)->hh.tbl = (UT_hash_table*)uthash_malloc(sizeof(UT_hash_table)); \ + if (!(head)->hh.tbl) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head)->hh.tbl->tail = &((head)->hh); \ + (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ + (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ + (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head); \ + (head)->hh.tbl->buckets = (UT_hash_bucket*)uthash_malloc( \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + (head)->hh.tbl->signature = HASH_SIGNATURE; \ + if (!(head)->hh.tbl->buckets) { \ + HASH_RECORD_OOM(oomed); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + } else { \ + uthash_bzero((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS * sizeof(struct UT_hash_bucket)); \ + HASH_BLOOM_MAKE((head)->hh.tbl, 
oomed); \ + IF_HASH_NONFATAL_OOM( \ + if (oomed) { \ + uthash_free((head)->hh.tbl->buckets, \ + HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + } \ + ) \ + } \ + } \ +} while (0) + +#define HASH_REPLACE_BYHASHVALUE_INORDER(hh,head,fieldname,keylen_in,hashval,add,replaced,cmpfcn) \ +do { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, replaced); \ + if (replaced) { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), keylen_in, hashval, add, cmpfcn); \ +} while (0) + +#define HASH_REPLACE_BYHASHVALUE(hh,head,fieldname,keylen_in,hashval,add,replaced) \ +do { \ + (replaced) = NULL; \ + HASH_FIND_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, replaced); \ + if (replaced) { \ + HASH_DELETE(hh, head, replaced); \ + } \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, add); \ +} while (0) + +#define HASH_REPLACE(hh,head,fieldname,keylen_in,add,replaced) \ +do { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE(hh, head, fieldname, keylen_in, _hr_hashv, add, replaced); \ +} while (0) + +#define HASH_REPLACE_INORDER(hh,head,fieldname,keylen_in,add,replaced,cmpfcn) \ +do { \ + unsigned _hr_hashv; \ + HASH_VALUE(&((add)->fieldname), keylen_in, _hr_hashv); \ + HASH_REPLACE_BYHASHVALUE_INORDER(hh, head, fieldname, keylen_in, _hr_hashv, add, replaced, cmpfcn); \ +} while (0) + +#define HASH_APPEND_LIST(hh, head, add) \ +do { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ + (head)->hh.tbl->tail->next = (add); \ + (head)->hh.tbl->tail = &((add)->hh); \ +} while (0) + +#define HASH_AKBI_INNER_LOOP(hh,head,add,cmpfcn) \ +do { \ + do { \ + if (cmpfcn(DECLTYPE(head)(_hs_iter), add) > 0) { \ + break; \ + } \ + } while ((_hs_iter = 
HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ +} while (0) + +#ifdef NO_DECLTYPE +#undef HASH_AKBI_INNER_LOOP +#define HASH_AKBI_INNER_LOOP(hh,head,add,cmpfcn) \ +do { \ + char *_hs_saved_head = (char*)(head); \ + do { \ + DECLTYPE_ASSIGN(head, _hs_iter); \ + if (cmpfcn(head, add) > 0) { \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + break; \ + } \ + DECLTYPE_ASSIGN(head, _hs_saved_head); \ + } while ((_hs_iter = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->next)); \ +} while (0) +#endif + +#if HASH_NONFATAL_OOM + +#define HASH_ADD_TO_TABLE(hh,head,keyptr,keylen_in,hashval,add,oomed) \ +do { \ + if (!(oomed)) { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, oomed); \ + if (oomed) { \ + HASH_ROLLBACK_BKT(hh, head, &(add)->hh); \ + HASH_DELETE_HH(hh, head, &(add)->hh); \ + (add)->hh.tbl = NULL; \ + uthash_nonfatal_oom(add); \ + } else { \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ + } \ + } else { \ + (add)->hh.tbl = NULL; \ + uthash_nonfatal_oom(add); \ + } \ +} while (0) + +#else + +#define HASH_ADD_TO_TABLE(hh,head,keyptr,keylen_in,hashval,add,oomed) \ +do { \ + unsigned _ha_bkt; \ + (head)->hh.tbl->num_items++; \ + HASH_TO_BKT(hashval, (head)->hh.tbl->num_buckets, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt], hh, &(add)->hh, oomed); \ + HASH_BLOOM_ADD((head)->hh.tbl, hashval); \ + HASH_EMIT_KEY(hh, head, keyptr, keylen_in); \ +} while (0) + +#endif + + +#define HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh,head,keyptr,keylen_in,hashval,add,cmpfcn) \ +do { \ + IF_HASH_NONFATAL_OOM( int _ha_oomed = 0; ) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (char*) (keyptr); \ + (add)->hh.keylen = (unsigned) (keylen_in); \ + if (!(head)) { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + IF_HASH_NONFATAL_OOM( if 
(!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( } ) \ + } else { \ + void *_hs_iter = (head); \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_AKBI_INNER_LOOP(hh, head, add, cmpfcn); \ + if (_hs_iter) { \ + (add)->hh.next = _hs_iter; \ + if (((add)->hh.prev = HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev)) { \ + HH_FROM_ELMT((head)->hh.tbl, (add)->hh.prev)->next = (add); \ + } else { \ + (head) = (add); \ + } \ + HH_FROM_ELMT((head)->hh.tbl, _hs_iter)->prev = (add); \ + } else { \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, "HASH_ADD_KEYPTR_BYHASHVALUE_INORDER"); \ +} while (0) + +#define HASH_ADD_KEYPTR_INORDER(hh,head,keyptr,keylen_in,add,cmpfcn) \ +do { \ + unsigned _hs_hashv; \ + HASH_VALUE(keyptr, keylen_in, _hs_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, keyptr, keylen_in, _hs_hashv, add, cmpfcn); \ +} while (0) + +#define HASH_ADD_BYHASHVALUE_INORDER(hh,head,fieldname,keylen_in,hashval,add,cmpfcn) \ + HASH_ADD_KEYPTR_BYHASHVALUE_INORDER(hh, head, &((add)->fieldname), keylen_in, hashval, add, cmpfcn) + +#define HASH_ADD_INORDER(hh,head,fieldname,keylen_in,add,cmpfcn) \ + HASH_ADD_KEYPTR_INORDER(hh, head, &((add)->fieldname), keylen_in, add, cmpfcn) + +#define HASH_ADD_KEYPTR_BYHASHVALUE(hh,head,keyptr,keylen_in,hashval,add) \ +do { \ + IF_HASH_NONFATAL_OOM( int _ha_oomed = 0; ) \ + (add)->hh.hashv = (hashval); \ + (add)->hh.key = (char*) (keyptr); \ + (add)->hh.keylen = (unsigned) (keylen_in); \ + if (!(head)) { \ + (add)->hh.next = NULL; \ + (add)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh, add, _ha_oomed); \ + IF_HASH_NONFATAL_OOM( if (!_ha_oomed) { ) \ + (head) = (add); \ + IF_HASH_NONFATAL_OOM( } ) \ + } else { \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_APPEND_LIST(hh, head, add); \ + } \ + HASH_ADD_TO_TABLE(hh, head, keyptr, keylen_in, hashval, add, _ha_oomed); \ + HASH_FSCK(hh, head, "HASH_ADD_KEYPTR_BYHASHVALUE"); \ +} while 
(0) + +#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add) \ +do { \ + unsigned _ha_hashv; \ + HASH_VALUE(keyptr, keylen_in, _ha_hashv); \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, keyptr, keylen_in, _ha_hashv, add); \ +} while (0) + +#define HASH_ADD_BYHASHVALUE(hh,head,fieldname,keylen_in,hashval,add) \ + HASH_ADD_KEYPTR_BYHASHVALUE(hh, head, &((add)->fieldname), keylen_in, hashval, add) + +#define HASH_ADD(hh,head,fieldname,keylen_in,add) \ + HASH_ADD_KEYPTR(hh, head, &((add)->fieldname), keylen_in, add) + +#define HASH_TO_BKT(hashv,num_bkts,bkt) \ +do { \ + bkt = ((hashv) & ((num_bkts) - 1U)); \ +} while (0) + +/* delete "delptr" from the hash table. + * "the usual" patch-up process for the app-order doubly-linked-list. + * The use of _hd_hh_del below deserves special explanation. + * These used to be expressed using (delptr) but that led to a bug + * if someone used the same symbol for the head and deletee, like + * HASH_DELETE(hh,users,users); + * We want that to work, but by changing the head (users) below + * we were forfeiting our ability to further refer to the deletee (users) + * in the patch-up process. Solution: use scratch space to + * copy the deletee pointer, then the latter references are via that + * scratch pointer rather than through the repointed (users) symbol. 
+ */ +#define HASH_DELETE(hh,head,delptr) \ + HASH_DELETE_HH(hh, head, &(delptr)->hh) + +#define HASH_DELETE_HH(hh,head,delptrhh) \ +do { \ + struct UT_hash_handle *_hd_hh_del = (delptrhh); \ + if ((_hd_hh_del->prev == NULL) && (_hd_hh_del->next == NULL)) { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } else { \ + unsigned _hd_bkt; \ + if (_hd_hh_del == (head)->hh.tbl->tail) { \ + (head)->hh.tbl->tail = HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev); \ + } \ + if (_hd_hh_del->prev != NULL) { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->prev)->next = _hd_hh_del->next; \ + } else { \ + DECLTYPE_ASSIGN(head, _hd_hh_del->next); \ + } \ + if (_hd_hh_del->next != NULL) { \ + HH_FROM_ELMT((head)->hh.tbl, _hd_hh_del->next)->prev = _hd_hh_del->prev; \ + } \ + HASH_TO_BKT(_hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + HASH_DEL_IN_BKT((head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ + (head)->hh.tbl->num_items--; \ + } \ + HASH_FSCK(hh, head, "HASH_DELETE_HH"); \ +} while (0) + +/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ +#define HASH_FIND_STR(head,findstr,out) \ +do { \ + unsigned _uthash_hfstr_keylen = (unsigned)uthash_strlen(findstr); \ + HASH_FIND(hh, head, findstr, _uthash_hfstr_keylen, out); \ +} while (0) +#define HASH_ADD_STR(head,strfield,add) \ +do { \ + unsigned _uthash_hastr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_ADD(hh, head, strfield[0], _uthash_hastr_keylen, add); \ +} while (0) +#define HASH_REPLACE_STR(head,strfield,add,replaced) \ +do { \ + unsigned _uthash_hrstr_keylen = (unsigned)uthash_strlen((add)->strfield); \ + HASH_REPLACE(hh, head, strfield[0], _uthash_hrstr_keylen, add, replaced); \ +} while (0) +#define HASH_FIND_INT(head,findint,out) \ + HASH_FIND(hh,head,findint,sizeof(int),out) +#define HASH_ADD_INT(head,intfield,add) \ + 
HASH_ADD(hh,head,intfield,sizeof(int),add) +#define HASH_REPLACE_INT(head,intfield,add,replaced) \ + HASH_REPLACE(hh,head,intfield,sizeof(int),add,replaced) +#define HASH_FIND_PTR(head,findptr,out) \ + HASH_FIND(hh,head,findptr,sizeof(void *),out) +#define HASH_ADD_PTR(head,ptrfield,add) \ + HASH_ADD(hh,head,ptrfield,sizeof(void *),add) +#define HASH_REPLACE_PTR(head,ptrfield,add,replaced) \ + HASH_REPLACE(hh,head,ptrfield,sizeof(void *),add,replaced) +#define HASH_DEL(head,delptr) \ + HASH_DELETE(hh,head,delptr) + +/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined. + * This is for uthash developer only; it compiles away if HASH_DEBUG isn't defined. + */ +#ifdef HASH_DEBUG +#include /* fprintf, stderr */ +#define HASH_OOPS(...) do { fprintf(stderr, __VA_ARGS__); exit(-1); } while (0) +#define HASH_FSCK(hh,head,where) \ +do { \ + struct UT_hash_handle *_thh; \ + if (head) { \ + unsigned _bkt_i; \ + unsigned _count = 0; \ + char *_prev; \ + for (_bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; ++_bkt_i) { \ + unsigned _bkt_count = 0; \ + _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ + _prev = NULL; \ + while (_thh) { \ + if (_prev != (char*)(_thh->hh_prev)) { \ + HASH_OOPS("%s: invalid hh_prev %p, actual %p\n", \ + (where), (void*)_thh->hh_prev, (void*)_prev); \ + } \ + _bkt_count++; \ + _prev = (char*)(_thh); \ + _thh = _thh->hh_next; \ + } \ + _count += _bkt_count; \ + if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \ + HASH_OOPS("%s: invalid bucket count %u, actual %u\n", \ + (where), (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ + } \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("%s: invalid hh item count %u, actual %u\n", \ + (where), (head)->hh.tbl->num_items, _count); \ + } \ + _count = 0; \ + _prev = NULL; \ + _thh = &(head)->hh; \ + while (_thh) { \ + _count++; \ + if (_prev != (char*)_thh->prev) { \ + HASH_OOPS("%s: invalid prev %p, actual %p\n", \ + (where), (void*)_thh->prev, 
(void*)_prev); \ + } \ + _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh); \ + _thh = (_thh->next ? HH_FROM_ELMT((head)->hh.tbl, _thh->next) : NULL); \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("%s: invalid app item count %u, actual %u\n", \ + (where), (head)->hh.tbl->num_items, _count); \ + } \ + } \ +} while (0) +#else +#define HASH_FSCK(hh,head,where) +#endif + +/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to + * the descriptor to which this macro is defined for tuning the hash function. + * The app can #include to get the prototype for write(2). */ +#ifdef HASH_EMIT_KEYS +#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) \ +do { \ + unsigned _klen = fieldlen; \ + write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \ + write(HASH_EMIT_KEYS, keyptr, (unsigned long)fieldlen); \ +} while (0) +#else +#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) +#endif + +/* default to Jenkin's hash unless overridden e.g. DHASH_FUNCTION=HASH_SAX */ +#ifdef HASH_FUNCTION +#define HASH_FCN HASH_FUNCTION +#else +#define HASH_FCN HASH_JEN +#endif + +/* The Bernstein hash function, used in Perl prior to v5.6. Note (x<<5+x)=x*33. 
*/ +#define HASH_BER(key,keylen,hashv) \ +do { \ + unsigned _hb_keylen = (unsigned)keylen; \ + const unsigned char *_hb_key = (const unsigned char*)(key); \ + (hashv) = 0; \ + while (_hb_keylen-- != 0U) { \ + (hashv) = (((hashv) << 5) + (hashv)) + *_hb_key++; \ + } \ +} while (0) + + +/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at + * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */ +#define HASH_SAX(key,keylen,hashv) \ +do { \ + unsigned _sx_i; \ + const unsigned char *_hs_key = (const unsigned char*)(key); \ + hashv = 0; \ + for (_sx_i=0; _sx_i < keylen; _sx_i++) { \ + hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \ + } \ +} while (0) +/* FNV-1a variation */ +#define HASH_FNV(key,keylen,hashv) \ +do { \ + unsigned _fn_i; \ + const unsigned char *_hf_key = (const unsigned char*)(key); \ + (hashv) = 2166136261U; \ + for (_fn_i=0; _fn_i < keylen; _fn_i++) { \ + hashv = hashv ^ _hf_key[_fn_i]; \ + hashv = hashv * 16777619U; \ + } \ +} while (0) + +#define HASH_OAT(key,keylen,hashv) \ +do { \ + unsigned _ho_i; \ + const unsigned char *_ho_key=(const unsigned char*)(key); \ + hashv = 0; \ + for(_ho_i=0; _ho_i < keylen; _ho_i++) { \ + hashv += _ho_key[_ho_i]; \ + hashv += (hashv << 10); \ + hashv ^= (hashv >> 6); \ + } \ + hashv += (hashv << 3); \ + hashv ^= (hashv >> 11); \ + hashv += (hashv << 15); \ +} while (0) + +#define HASH_JEN_MIX(a,b,c) \ +do { \ + a -= b; a -= c; a ^= ( c >> 13 ); \ + b -= c; b -= a; b ^= ( a << 8 ); \ + c -= a; c -= b; c ^= ( b >> 13 ); \ + a -= b; a -= c; a ^= ( c >> 12 ); \ + b -= c; b -= a; b ^= ( a << 16 ); \ + c -= a; c -= b; c ^= ( b >> 5 ); \ + a -= b; a -= c; a ^= ( c >> 3 ); \ + b -= c; b -= a; b ^= ( a << 10 ); \ + c -= a; c -= b; c ^= ( b >> 15 ); \ +} while (0) + +#define HASH_JEN(key,keylen,hashv) \ +do { \ + unsigned _hj_i,_hj_j,_hj_k; \ + unsigned const char *_hj_key=(unsigned const char*)(key); \ + hashv = 0xfeedbeefu; \ + _hj_i = _hj_j = 0x9e3779b9u; \ + _hj_k = 
(unsigned)(keylen); \ + while (_hj_k >= 12U) { \ + _hj_i += (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 ) \ + + ( (unsigned)_hj_key[2] << 16 ) \ + + ( (unsigned)_hj_key[3] << 24 ) ); \ + _hj_j += (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 ) \ + + ( (unsigned)_hj_key[6] << 16 ) \ + + ( (unsigned)_hj_key[7] << 24 ) ); \ + hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 ) \ + + ( (unsigned)_hj_key[10] << 16 ) \ + + ( (unsigned)_hj_key[11] << 24 ) ); \ + \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + \ + _hj_key += 12; \ + _hj_k -= 12U; \ + } \ + hashv += (unsigned)(keylen); \ + switch ( _hj_k ) { \ + case 11: hashv += ( (unsigned)_hj_key[10] << 24 ); /* FALLTHROUGH */ \ + case 10: hashv += ( (unsigned)_hj_key[9] << 16 ); /* FALLTHROUGH */ \ + case 9: hashv += ( (unsigned)_hj_key[8] << 8 ); /* FALLTHROUGH */ \ + case 8: _hj_j += ( (unsigned)_hj_key[7] << 24 ); /* FALLTHROUGH */ \ + case 7: _hj_j += ( (unsigned)_hj_key[6] << 16 ); /* FALLTHROUGH */ \ + case 6: _hj_j += ( (unsigned)_hj_key[5] << 8 ); /* FALLTHROUGH */ \ + case 5: _hj_j += _hj_key[4]; /* FALLTHROUGH */ \ + case 4: _hj_i += ( (unsigned)_hj_key[3] << 24 ); /* FALLTHROUGH */ \ + case 3: _hj_i += ( (unsigned)_hj_key[2] << 16 ); /* FALLTHROUGH */ \ + case 2: _hj_i += ( (unsigned)_hj_key[1] << 8 ); /* FALLTHROUGH */ \ + case 1: _hj_i += _hj_key[0]; \ + } \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ +} while (0) + +/* The Paul Hsieh hash function */ +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif +#define HASH_SFH(key,keylen,hashv) \ +do { \ + unsigned const char *_sfh_key=(unsigned const char*)(key); \ + uint32_t _sfh_tmp, _sfh_len = (uint32_t)keylen; \ + \ + unsigned _sfh_rem = _sfh_len & 3U; \ + 
_sfh_len >>= 2; \ + hashv = 0xcafebabeu; \ + \ + /* Main loop */ \ + for (;_sfh_len > 0U; _sfh_len--) { \ + hashv += get16bits (_sfh_key); \ + _sfh_tmp = ((uint32_t)(get16bits (_sfh_key+2)) << 11) ^ hashv; \ + hashv = (hashv << 16) ^ _sfh_tmp; \ + _sfh_key += 2U*sizeof (uint16_t); \ + hashv += hashv >> 11; \ + } \ + \ + /* Handle end cases */ \ + switch (_sfh_rem) { \ + case 3: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 16; \ + hashv ^= (uint32_t)(_sfh_key[sizeof (uint16_t)]) << 18; \ + hashv += hashv >> 11; \ + break; \ + case 2: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 11; \ + hashv += hashv >> 17; \ + break; \ + case 1: hashv += *_sfh_key; \ + hashv ^= hashv << 10; \ + hashv += hashv >> 1; \ + } \ + \ + /* Force "avalanching" of final 127 bits */ \ + hashv ^= hashv << 3; \ + hashv += hashv >> 5; \ + hashv ^= hashv << 4; \ + hashv += hashv >> 17; \ + hashv ^= hashv << 25; \ + hashv += hashv >> 6; \ +} while (0) + +/* iterate over items in a known bucket to find desired item */ +#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,hashval,out) \ +do { \ + if ((head).hh_head != NULL) { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (head).hh_head)); \ + } else { \ + (out) = NULL; \ + } \ + while ((out) != NULL) { \ + if ((out)->hh.hashv == (hashval) && (out)->hh.keylen == (keylen_in)) { \ + if (HASH_KEYCMP((out)->hh.key, keyptr, keylen_in) == 0) { \ + break; \ + } \ + } \ + if ((out)->hh.hh_next != NULL) { \ + DECLTYPE_ASSIGN(out, ELMT_FROM_HH(tbl, (out)->hh.hh_next)); \ + } else { \ + (out) = NULL; \ + } \ + } \ +} while (0) + +/* add an item to a bucket */ +#define HASH_ADD_TO_BKT(head,hh,addhh,oomed) \ +do { \ + UT_hash_bucket *_ha_head = &(head); \ + _ha_head->count++; \ + (addhh)->hh_next = _ha_head->hh_head; \ + (addhh)->hh_prev = NULL; \ + if (_ha_head->hh_head != NULL) { \ + _ha_head->hh_head->hh_prev = (addhh); \ + } \ + _ha_head->hh_head = (addhh); \ + if ((_ha_head->count >= ((_ha_head->expand_mult + 1U) * HASH_BKT_CAPACITY_THRESH)) \ 
+ && !(addhh)->tbl->noexpand) { \ + HASH_EXPAND_BUCKETS(addhh,(addhh)->tbl, oomed); \ + IF_HASH_NONFATAL_OOM( \ + if (oomed) { \ + HASH_DEL_IN_BKT(head,addhh); \ + } \ + ) \ + } \ +} while (0) + +/* remove an item from a given bucket */ +#define HASH_DEL_IN_BKT(head,delhh) \ +do { \ + UT_hash_bucket *_hd_head = &(head); \ + _hd_head->count--; \ + if (_hd_head->hh_head == (delhh)) { \ + _hd_head->hh_head = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_prev) { \ + (delhh)->hh_prev->hh_next = (delhh)->hh_next; \ + } \ + if ((delhh)->hh_next) { \ + (delhh)->hh_next->hh_prev = (delhh)->hh_prev; \ + } \ +} while (0) + +/* Bucket expansion has the effect of doubling the number of buckets + * and redistributing the items into the new buckets. Ideally the + * items will distribute more or less evenly into the new buckets + * (the extent to which this is true is a measure of the quality of + * the hash function as it applies to the key domain). + * + * With the items distributed into more buckets, the chain length + * (item count) in each bucket is reduced. Thus by expanding buckets + * the hash keeps a bound on the chain length. This bounded chain + * length is the essence of how a hash provides constant time lookup. + * + * The calculation of tbl->ideal_chain_maxlen below deserves some + * explanation. First, keep in mind that we're calculating the ideal + * maximum chain length based on the *new* (doubled) bucket count. + * In fractions this is just n/b (n=number of items,b=new num buckets). + * Since the ideal chain length is an integer, we want to calculate + * ceil(n/b). We don't depend on floating point arithmetic in this + * hash, so to calculate ceil(n/b) with integers we could write + * + * ceil(n/b) = (n/b) + ((n%b)?1:0) + * + * and in fact a previous version of this hash did just that. + * But now we have improved things a bit by recognizing that b is + * always a power of two. 
We keep its base 2 log handy (call it lb), + * so now we can write this with a bit shift and logical AND: + * + * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) + * + */ +#define HASH_EXPAND_BUCKETS(hh,tbl,oomed) \ +do { \ + unsigned _he_bkt; \ + unsigned _he_bkt_i; \ + struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ + UT_hash_bucket *_he_new_buckets, *_he_newbkt; \ + _he_new_buckets = (UT_hash_bucket*)uthash_malloc( \ + 2UL * (tbl)->num_buckets * sizeof(struct UT_hash_bucket)); \ + if (!_he_new_buckets) { \ + HASH_RECORD_OOM(oomed); \ + } else { \ + uthash_bzero(_he_new_buckets, \ + 2UL * (tbl)->num_buckets * sizeof(struct UT_hash_bucket)); \ + (tbl)->ideal_chain_maxlen = \ + ((tbl)->num_items >> ((tbl)->log2_num_buckets+1U)) + \ + ((((tbl)->num_items & (((tbl)->num_buckets*2U)-1U)) != 0U) ? 1U : 0U); \ + (tbl)->nonideal_items = 0; \ + for (_he_bkt_i = 0; _he_bkt_i < (tbl)->num_buckets; _he_bkt_i++) { \ + _he_thh = (tbl)->buckets[ _he_bkt_i ].hh_head; \ + while (_he_thh != NULL) { \ + _he_hh_nxt = _he_thh->hh_next; \ + HASH_TO_BKT(_he_thh->hashv, (tbl)->num_buckets * 2U, _he_bkt); \ + _he_newbkt = &(_he_new_buckets[_he_bkt]); \ + if (++(_he_newbkt->count) > (tbl)->ideal_chain_maxlen) { \ + (tbl)->nonideal_items++; \ + if (_he_newbkt->count > _he_newbkt->expand_mult * (tbl)->ideal_chain_maxlen) { \ + _he_newbkt->expand_mult++; \ + } \ + } \ + _he_thh->hh_prev = NULL; \ + _he_thh->hh_next = _he_newbkt->hh_head; \ + if (_he_newbkt->hh_head != NULL) { \ + _he_newbkt->hh_head->hh_prev = _he_thh; \ + } \ + _he_newbkt->hh_head = _he_thh; \ + _he_thh = _he_hh_nxt; \ + } \ + } \ + uthash_free((tbl)->buckets, (tbl)->num_buckets * sizeof(struct UT_hash_bucket)); \ + (tbl)->num_buckets *= 2U; \ + (tbl)->log2_num_buckets++; \ + (tbl)->buckets = _he_new_buckets; \ + (tbl)->ineff_expands = ((tbl)->nonideal_items > ((tbl)->num_items >> 1)) ? 
\ + ((tbl)->ineff_expands+1U) : 0U; \ + if ((tbl)->ineff_expands > 1U) { \ + (tbl)->noexpand = 1; \ + uthash_noexpand_fyi(tbl); \ + } \ + uthash_expand_fyi(tbl); \ + } \ +} while (0) + + +/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ +/* Note that HASH_SORT assumes the hash handle name to be hh. + * HASH_SRT was added to allow the hash handle name to be passed in. */ +#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn) +#define HASH_SRT(hh,head,cmpfcn) \ +do { \ + unsigned _hs_i; \ + unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize; \ + struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ + if (head != NULL) { \ + _hs_insize = 1; \ + _hs_looping = 1; \ + _hs_list = &((head)->hh); \ + while (_hs_looping != 0U) { \ + _hs_p = _hs_list; \ + _hs_list = NULL; \ + _hs_tail = NULL; \ + _hs_nmerges = 0; \ + while (_hs_p != NULL) { \ + _hs_nmerges++; \ + _hs_q = _hs_p; \ + _hs_psize = 0; \ + for (_hs_i = 0; _hs_i < _hs_insize; ++_hs_i) { \ + _hs_psize++; \ + _hs_q = ((_hs_q->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) : NULL); \ + if (_hs_q == NULL) { \ + break; \ + } \ + } \ + _hs_qsize = _hs_insize; \ + while ((_hs_psize != 0U) || ((_hs_qsize != 0U) && (_hs_q != NULL))) { \ + if (_hs_psize == 0U) { \ + _hs_e = _hs_q; \ + _hs_q = ((_hs_q->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) : NULL); \ + _hs_qsize--; \ + } else if ((_hs_qsize == 0U) || (_hs_q == NULL)) { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) { \ + _hs_p = ((_hs_p->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) : NULL); \ + } \ + _hs_psize--; \ + } else if ((cmpfcn( \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl, _hs_p)), \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl, _hs_q)) \ + )) <= 0) { \ + _hs_e = _hs_p; \ + if (_hs_p != NULL) { \ + _hs_p = ((_hs_p->next != NULL) ? 
\ + HH_FROM_ELMT((head)->hh.tbl, _hs_p->next) : NULL); \ + } \ + _hs_psize--; \ + } else { \ + _hs_e = _hs_q; \ + _hs_q = ((_hs_q->next != NULL) ? \ + HH_FROM_ELMT((head)->hh.tbl, _hs_q->next) : NULL); \ + _hs_qsize--; \ + } \ + if ( _hs_tail != NULL ) { \ + _hs_tail->next = ((_hs_e != NULL) ? \ + ELMT_FROM_HH((head)->hh.tbl, _hs_e) : NULL); \ + } else { \ + _hs_list = _hs_e; \ + } \ + if (_hs_e != NULL) { \ + _hs_e->prev = ((_hs_tail != NULL) ? \ + ELMT_FROM_HH((head)->hh.tbl, _hs_tail) : NULL); \ + } \ + _hs_tail = _hs_e; \ + } \ + _hs_p = _hs_q; \ + } \ + if (_hs_tail != NULL) { \ + _hs_tail->next = NULL; \ + } \ + if (_hs_nmerges <= 1U) { \ + _hs_looping = 0; \ + (head)->hh.tbl->tail = _hs_tail; \ + DECLTYPE_ASSIGN(head, ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ + } \ + _hs_insize *= 2U; \ + } \ + HASH_FSCK(hh, head, "HASH_SRT"); \ + } \ +} while (0) + +/* This function selects items from one hash into another hash. + * The end result is that the selected items have dual presence + * in both hashes. There is no copy of the items made; rather + * they are added into the new hash through a secondary hash + * hash handle that must be present in the structure. 
*/ +#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ +do { \ + unsigned _src_bkt, _dst_bkt; \ + void *_last_elt = NULL, *_elt; \ + UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL; \ + ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst)); \ + if ((src) != NULL) { \ + for (_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) { \ + for (_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \ + _src_hh != NULL; \ + _src_hh = _src_hh->hh_next) { \ + _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ + if (cond(_elt)) { \ + IF_HASH_NONFATAL_OOM( int _hs_oomed = 0; ) \ + _dst_hh = (UT_hash_handle*)(void*)(((char*)_elt) + _dst_hho); \ + _dst_hh->key = _src_hh->key; \ + _dst_hh->keylen = _src_hh->keylen; \ + _dst_hh->hashv = _src_hh->hashv; \ + _dst_hh->prev = _last_elt; \ + _dst_hh->next = NULL; \ + if (_last_elt_hh != NULL) { \ + _last_elt_hh->next = _elt; \ + } \ + if ((dst) == NULL) { \ + DECLTYPE_ASSIGN(dst, _elt); \ + HASH_MAKE_TABLE(hh_dst, dst, _hs_oomed); \ + IF_HASH_NONFATAL_OOM( \ + if (_hs_oomed) { \ + uthash_nonfatal_oom(_elt); \ + (dst) = NULL; \ + continue; \ + } \ + ) \ + } else { \ + _dst_hh->tbl = (dst)->hh_dst.tbl; \ + } \ + HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ + HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt], hh_dst, _dst_hh, _hs_oomed); \ + (dst)->hh_dst.tbl->num_items++; \ + IF_HASH_NONFATAL_OOM( \ + if (_hs_oomed) { \ + HASH_ROLLBACK_BKT(hh_dst, dst, _dst_hh); \ + HASH_DELETE_HH(hh_dst, dst, _dst_hh); \ + _dst_hh->tbl = NULL; \ + uthash_nonfatal_oom(_elt); \ + continue; \ + } \ + ) \ + HASH_BLOOM_ADD(_dst_hh->tbl, _dst_hh->hashv); \ + _last_elt = _elt; \ + _last_elt_hh = _dst_hh; \ + } \ + } \ + } \ + } \ + HASH_FSCK(hh_dst, dst, "HASH_SELECT"); \ +} while (0) + +#define HASH_CLEAR(hh,head) \ +do { \ + if ((head) != NULL) { \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket)); \ + 
uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head) = NULL; \ + } \ +} while (0) + +#define HASH_OVERHEAD(hh,head) \ + (((head) != NULL) ? ( \ + (size_t)(((head)->hh.tbl->num_items * sizeof(UT_hash_handle)) + \ + ((head)->hh.tbl->num_buckets * sizeof(UT_hash_bucket)) + \ + sizeof(UT_hash_table) + \ + (HASH_BLOOM_BYTELEN))) : 0U) + +#ifdef NO_DECLTYPE +#define HASH_ITER(hh,head,el,tmp) \ +for(((el)=(head)), ((*(char**)(&(tmp)))=(char*)((head!=NULL)?(head)->hh.next:NULL)); \ + (el) != NULL; ((el)=(tmp)), ((*(char**)(&(tmp)))=(char*)((tmp!=NULL)?(tmp)->hh.next:NULL))) +#else +#define HASH_ITER(hh,head,el,tmp) \ +for(((el)=(head)), ((tmp)=DECLTYPE(el)((head!=NULL)?(head)->hh.next:NULL)); \ + (el) != NULL; ((el)=(tmp)), ((tmp)=DECLTYPE(el)((tmp!=NULL)?(tmp)->hh.next:NULL))) +#endif + +/* obtain a count of items in the hash */ +#define HASH_COUNT(head) HASH_CNT(hh,head) +#define HASH_CNT(hh,head) ((head != NULL)?((head)->hh.tbl->num_items):0U) + +typedef struct UT_hash_bucket { + struct UT_hash_handle *hh_head; + unsigned count; + + /* expand_mult is normally set to 0. In this situation, the max chain length + * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If + * the bucket's chain exceeds this length, bucket expansion is triggered). + * However, setting expand_mult to a non-zero value delays bucket expansion + * (that would be triggered by additions to this particular bucket) + * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH. + * (The multiplier is simply expand_mult+1). The whole idea of this + * multiplier is to reduce bucket expansions, since they are expensive, in + * situations where we know that a particular bucket tends to be overused. + * It is better to let its chain length grow to a longer yet-still-bounded + * value, than to do an O(n) bucket expansion too often. 
+ */ + unsigned expand_mult; + +} UT_hash_bucket; + +/* random signature used only to find hash tables in external analysis */ +#define HASH_SIGNATURE 0xa0111fe1u +#define HASH_BLOOM_SIGNATURE 0xb12220f2u + +typedef struct UT_hash_table { + UT_hash_bucket *buckets; + unsigned num_buckets, log2_num_buckets; + unsigned num_items; + struct UT_hash_handle *tail; /* tail hh in app order, for fast append */ + ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element */ + + /* in an ideal situation (all buckets used equally), no bucket would have + * more than ceil(#items/#buckets) items. that's the ideal chain length. */ + unsigned ideal_chain_maxlen; + + /* nonideal_items is the number of items in the hash whose chain position + * exceeds the ideal chain maxlen. these items pay the penalty for an uneven + * hash distribution; reaching them in a chain traversal takes >ideal steps */ + unsigned nonideal_items; + + /* ineffective expands occur when a bucket doubling was performed, but + * afterward, more than half the items in the hash had nonideal chain + * positions. If this happens on two consecutive expansions we inhibit any + * further expansion, as it's not helping; this happens when the hash + * function isn't a good fit for the key domain. When expansion is inhibited + * the hash will still work, albeit no longer in constant time. 
*/ + unsigned ineff_expands, noexpand; + + uint32_t signature; /* used only to find hash tables in external analysis */ +#ifdef HASH_BLOOM + uint32_t bloom_sig; /* used only to test bloom exists in external analysis */ + uint8_t *bloom_bv; + uint8_t bloom_nbits; +#endif + +} UT_hash_table; + +typedef struct UT_hash_handle { + struct UT_hash_table *tbl; + void *prev; /* prev element in app order */ + void *next; /* next element in app order */ + struct UT_hash_handle *hh_prev; /* previous hh in bucket order */ + struct UT_hash_handle *hh_next; /* next hh in bucket order */ + void *key; /* ptr to enclosing struct's key */ + unsigned keylen; /* enclosing struct's key len */ + unsigned hashv; /* result of hash-fcn(key) */ +} UT_hash_handle; + +#endif /* UTHASH_H */ diff --git a/tools/utils/snpxtr/snpxtr.c b/tools/utils/snpxtr/snpxtr.c index 3c17d807..e86641e3 100644 --- a/tools/utils/snpxtr/snpxtr.c +++ b/tools/utils/snpxtr/snpxtr.c @@ -33,10 +33,15 @@ int main(int argc, char **argv) { int ns = bcf_hdr_nsamples(args.hdr); assert(ns > 0); if(args.snplistname) read_snplist(&args); - if(args.dbSNPfilename) args.dbSNP_hdr = load_dbSNP_header(args.dbSNPfilename); -// init_files(&args); - process_input(&args); - fprintf(stderr,"snpxtr: finished\n"); + bool error = false; + if(args.dbSNPfilename) { + args.dbSNP_hdr = load_dbSNP_header(args.dbSNPfilename); + if(!args.dbSNP_hdr) error = true; + } + if(!error) { + process_input(&args); + fprintf(stderr,"snpxtr: finished\n"); + } bcf_sr_destroy(args.sr); return 0; } From c262819dd487648ffed4c39473a923ceeebd8d15 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Tue, 11 Feb 2020 06:55:26 +0100 Subject: [PATCH 38/61] Add support for json dbSNP files to dbSNP_idx --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index b6cf5fb4..309ccf85 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit b6cf5fb46faa3baf24501a457500864597dd8098 +Subproject commit 
309ccf855548ee938622da1bbb25c9d92581ac19 From dace960e2e126dcff78d858a28c7e7fac1b19e86 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 12 Feb 2020 08:30:06 +0100 Subject: [PATCH 39/61] New bs_call version with ability to read VCF format dbSNP files --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index 309ccf85..9d5dfb74 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 309ccf855548ee938622da1bbb25c9d92581ac19 +Subproject commit 9d5dfb7420c0c5852db41fdc6f4100f1230b8d44 From 72b2bd1ce245d3d03c822d0dc92e976ca2be37ef Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 12 Feb 2020 08:30:38 +0100 Subject: [PATCH 40/61] Update changes --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index b364e6b4..13308927 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,16 @@ Documentation can be found at ---------- Changelog: ---------- + 3.5.0 Switch bs_call and snpxtr to use the new dbSNP index format + 3.5.0 Add ability of dbSNP to read the new JSON and VCF dbSNP format files + that are now used for human and non-human species respectively + 3.5.0 Add multithreading to dbSNP_idx + 3.5.0 Change format of dbSNP index to allow (a) efficient loading + of SNP data for individual contigs and (b) parallel index creation + 3.5.0 Rewrite mextr and snpxtr as standalone tools rather than + bcftools plugins. Now multithreaded and (relatively) memoryefficient + 3.5.0 Replace bedToBigBed and wigToBigWig to reduce memory usage + and improve speed 3.4.4 Sort input bcf files to bcftools concat stage to ensure reproducibility. 
3.4.4 Add extra sort keys when generating pools to ensure stability of pool membership in the event of multiple contigs having the same size From 31ca030d25505345d298d9f7f51ead48c8684101 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 12 Feb 2020 17:44:31 +0100 Subject: [PATCH 41/61] Fix null dereference when dbSNP index not used --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index 9d5dfb74..a99f470f 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 9d5dfb7420c0c5852db41fdc6f4100f1230b8d44 +Subproject commit a99f470f1ec7e65252247a8f39b64ede51dc53ae From 2525a0f340e8bc05c0c4414b6ae8cc470163c6fb Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 12 Feb 2020 21:06:54 +0100 Subject: [PATCH 42/61] Force bs_call to process contig pools in decreasing size order --- README.md | 3 +++ gemBS/__init__.py | 4 ++-- gemBS/__pycache__/__init__.cpython-37.pyc | Bin 0 -> 32627 bytes gemBS/__pycache__/database.cpython-37.pyc | Bin 0 -> 12820 bytes gemBS/__pycache__/parser.cpython-37.pyc | Bin 0 -> 5495 bytes gemBS/__pycache__/utils.cpython-37.pyc | Bin 0 -> 11317 bytes gemBS/database.py | 18 ++++++++++-------- gemBS/production.py | 2 +- 8 files changed, 16 insertions(+), 11 deletions(-) create mode 100644 gemBS/__pycache__/__init__.cpython-37.pyc create mode 100644 gemBS/__pycache__/database.cpython-37.pyc create mode 100644 gemBS/__pycache__/parser.cpython-37.pyc create mode 100644 gemBS/__pycache__/utils.cpython-37.pyc diff --git a/README.md b/README.md index 13308927..58c62b2e 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,9 @@ Documentation can be found at ---------- Changelog: ---------- + 3.5.0 Make bs_call process contig pools from largest to smallest (this change alters the sqlite db format so + if you have a previously started gemBS run you should (a) remove the .gemBS directory, (b) redo the + 'gemBS prepare' step to recreate the db file and (3) run 'gemBS db-sync'. 
3.5.0 Switch bs_call and snpxtr to use the new dbSNP index format 3.5.0 Add ability of dbSNP to read the new JSON and VCF dbSNP format files that are now used for human and non-human species respectively diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 3d1ee24a..ea73f620 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -966,7 +966,7 @@ def __next__(self): mrg_file = "" mrg_ok = False if self.no_merge else True list_bcfs = [] - for fname, pool, ftype, status in c.execute("SELECT filepath, poolid, type, status FROM calling WHERE sample = ?", (sample,)): + for fname, pool, ftype, status in c.execute("SELECT filepath, poolid, type, status FROM calling WHERE sample = ? ORDER BY poolsize DESC", (sample,)): if self.ignore_db: status = self.status.get(fname, 0) if ftype == 'POOL_BCF': @@ -1145,7 +1145,7 @@ def methylationCalling(reference=None,species=None,sample_bam=None,output_bcf=No sample_conversion - per sample conversion rates (calculated if conversion == 'auto') benchmark_mode - remove version and date information from header """ - + for snp, pl in output_bcf.items(): for v in pl: odir = os.path.dirname(v[0]) diff --git a/gemBS/__pycache__/__init__.cpython-37.pyc b/gemBS/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..851bcff1b299a18a742c43ed6c5bf49eb58850ae GIT binary patch literal 32627 zcmb__3v683dEU&O_k+VBMN!mqM{iLgsmE&VU9Ge$k(9JrOOz~Wt@T~4XNPl!qoIZ~ z)VY@uHG9W#S5D&?ekj{EeYq_a7;X|6hU2DbV7P6X1dR){MG>Tldy%9`nzq>>MT5Xd zgGG||`~Gw1L9W&<+ESV`=lwqa`<$5`7)XckxA4iobNk|NghKy`fzID{c+TSEd_5cr zsgNB~VP)0Bc38eG+mi2y9g**-9hL8x9h2|49hdKfos{pCos#dgotE#6ossXXot5ta zdqBQ(c22%KU_0d%H?kwpO-Pwl73RL-vk~A(c_t&xTaCvhz{I-c{M922@TBs=OLf!)lWn zQJdAM+M>qP)~{qAh3#ihVw>8I65AhHb^)OsY9~TF5!$WJsAp8+vtfIWdP(h8d*It! 
z4XM2gmfENGBeYMwtPZGy@aKJM}hT0AxbX=W4=mbLJ z2tBJ#B6PBnTs(}v9#PZkBkH3lH=$}a_ymer`aex z(52<&iY}CNbETmQ?wv}ZT3MRDQCO}mSL(G!1yLopu&kT+YpUX;OY^mQ&3#yKn*|hG zXzC>?qiPEa66)`rU%pdwD}`%8#qV(&&Ao@M6B>{B zDZSDty3JIFO&fR=PidP!UuP3YewBa>)q$|rM z_YMol5PFG)nu7tkE4oopQ&M=YvWiw7bh^sAIs;`L(wU%=n&X;5GgGgDUV$If4#w1I zx_4@gD*Ld&tr+yQUcwIQE7~Y6Rr>oWQ}Cc7y)8M;3RB%exzs4kS4>6Rg54=CbN%^8 zsFuWM1|R3ou)uC;jc_aU#A;ifr9yX8`m7sTwALb?<%JO&kxqKm?gIL0xO?X_=(bD0gXV+XtIxJadeX`Bu~MzA6ULyEfPqr=yJvJ2RmLsQprEmjaaDFD?D6&1mAwd zdFc4jlCIwOW6m-lzMm+~JJMjh<}j}xFO^Xxa=l)>`Y}J%gVu=vZutr3cPh(%lwi}( z7K?y^E)KL)Ec%%uAvzGkQn5Im_M=XvzTih0_LGF00KM9eN=soRFv^dcMX_TG(v#Q% zJFJH>tv?mg$ME(3_|ey#igu3Psi5hj<)*3}l{E_7VzJh!xy9nt@R!a3CB~dJ@X1Q3DzR7_4oa5-F&cG?)<3!;kQ_--*_yf99 zQgf(&rd}yES{aOaver=Ojc%nGSQ0GnC+3}Exm2(FnFqD%&E|BidaG9TGxHTCf$56! zQ)>Rk+_hp&t@|u_G3USwbk*g{s_ACp-t#9Z2b6Y-IskM@Z=j){fyM$Nk8KdacZ3#y@@&+zW{3 zBCC)WDjFA{9)5s%xDj!q;>N^{i<=NPDQ*hDK2%H#z=+KGp_}6wKeAA-;aJq&B0U+* z&@$GPOX=r=z$Q8=7dWvr!bFmNr{t7X8EO3vK8p2p?-z;Fo2ggrm)6 zz~T(Y0Rr4WTh`!5>&=oHkD7gP4dd`*ivnYltov1D**MQ!h0oyQjKB%?Z4yGg9e`@B zq0V&m_KK`8vJc|ZC6;oCHPiO+3z(oV2nrHR7QQ2^_f-UlyU~tv0vE$x^mNFJde+^z zzP)+?gj5s=-g**3o>7npv3A^xn|xbCUhH(}-t?`|>Xa7=^6Uxn#7xP%3B9l(|H7@% zy61kh!;6?dQ7BFnc67&-7wiBxtyhKp*Db$>7r>g8_)JvWYp?4|tYF9gL zV$jc+mlonKiI}*TM(@6d)_&m@Mz3P1FY`FO8uBtI^=Iv@n|Ksz4|rK`pqO0!eJ|r> zyCeNI8AZ;EsrX%zAQ!wGa-|yIKukhnUYD43<12_sO3b3fWE!7GOiE%riODur5tEjf zFG&kLdyM9d2l)i!N`r~xYv*2I=B8ve4)jY78FkH{ld4&!E zr@lnzRXQJo0}3g*=$O-Fx)dWt;rPjqRUV$#x~Zil&@Nzl8%p0~!LxLx>9B2mk(2I-ZZ0eT5oVn7CDUa4$6-Jd`cU%>80fac+@q2_NsYcX`wo_h8K=8X>OuyxZ*DM#z0Cr`R zyApX$cuCy=ct#vT`@FdRw3hF&$h?mgsx(P4kCLdYOyI!h_STGYoo1TTfq_BUj zx%b0I$5XOnG}l!VH|Q#zJ8=96SZr)rtTG6mbPWuGa-gIsKhd?g$;&8t~5TG z)dCh9TOk0^d4SC30N#w3R#-GreG1@IC91J@Zgtwr0SKRJ4`Pd@y+JJM zpqIjis;}4GTtGOi^-BQhb<`ZuKdX}MD7Fj;!Mm3Ju9sPhsFX^B^T$SH zyIVslg|?*++^~-=JsiYmuwnCVbP>F=mseSo8Bzo7VTnn2L*RUo*G+muDhC?k`|V9` zYAx+;!X_OtHMBRc`f3oP_|^6xdY`^a&iNIN$IZxiM$leZudQD8M*2o{Iv7!w<3^uY z>MbClVaffaPLI3+^oTRr9XEjYTNw4AvhGGu<^;%F)4~b1z_A9a@U#|wvD?BQ|G*Z0 
zQINz5qUR2Ur1ECG=&7Zr+^iacya064qwqW7V7AOm=P9pvY&o`@Q^Uy97t6eAQ&10A z;VUYbMXB{Goj$3}81<+(8k8NuJkk#dh@xL=#2Sxc6}EU=P|q*BgWeW5|HwkS-}SZt zSZrxTt075^=qkn-Wv3FtMu}FK`#+v?;cwu72F}UWornvZ1c9Q4SQRJ z8ic&KYPGk!o4jq__Nt|}xFd_3)wVB2+B>`*i=%4$Rv_8-&ed6O=i=Df)+eF0ZBIh& zUEZ$85$F1~khcqR<msU1%&wF^6Nmp8Vy)7!;%9!KQ7q)GT-jL%dl z*Luc#2DN;xJMwNUdN(Gd&NE0u3H1!;$BU~1z9WyLj@ik(yeQ@>5zKEw?M9u+U>=f~ zhlum%ZglM#5B_k-Ev)SZcuzjD!l6bIE4_y^0}=xiHK4#AFV-%A}&6#^jtkhiMe{7(sIhl)tPFhy;o9RfIr&aha9KW>1w3CAE!9P zQ~Wsk<{;9EVb$9Q9%j7*Yx}rXAVznA7~K!~SWJ%^aunl#iQ_i}DWKg!?50Dj z%kBYh*P}3Yl)_0KZy)v!uN}miAA&yMEau^`w+Hga!)xP$?v8jzWVT}Z9q)*D5NAB5 zTd4UQ&c{LII|4%O@Does`LK7;JA&0agdKr+?1OZ36Im5}zr2j^c9+pW`5Q^XTm%_V#i3O-uTF#2a@Ha|9q8WBO0HKfQ76 zIEH@GA%( zYrKo_q=a8b_;>@OSH~s%dH0Aru{gOl<()8d^eF62$hhHd4Ly!KzM5|DZd^l&aVha# zl&ROa}oH|fQZGpR?w7)_$alVEZt`j{MI(5!b-YWs$F zl5B}LfK^H8pTlS`V6>CLD3ibsNC$bJxc6y{4ZW0nzIRe#F4vBGQ{K^z^p_by1illsTK=S*Ee>?N!>VqL5*@B)0QGj|h&Dwu~!j5T)e8#u4uF}qm3 ziZUtvsnxIZ6adx&s&bxD?|#fVU~e~kP{Nmjz5WrTeJt>Q6#mPB|9N!qtAq?*Lc=@*wuru z3agUVj|KXM4D@{*Za{Z{loV6c$J?g>!+@v0{A=wOg${WUyK(}(xsLsFdiC4hX~3gz zwa-Z3&!`)isWa-PxUT^#V;l&70$Adf%+|LtS1-xP`tk-Nd`U)lM(s!Gmt+plsDt9? 
z{GCw;#D4$~{&l1u!}llgeN@tHq@BXPP3!Aw8nMT*OR{R8@$po9FZHcsigWUzGZ8j3 z5#Xuf z>rH#J3zm1zo54=Z+#AKIdwJ~)%epTuzPxz0eIA(a!s>NZ#EQJyzUZ9;l)o5YgIBy) z9%uAr?-lM!wFB$U(0TP1_Si+lV($_cpg$K?$)sY|E})&uXb0c=Cy{$za`a{(=@nF2 z9fin6mvhh$pKGjnSYvP5X0jmUmV&D$xF5l|r@!tCiAQxt-Y z=j?>Lw9II5SEVKKWSz>rl?pgBhNK73ELTJ|U9Q-}U2l>41QFHdMIr$mNs!60GmT~g zy^%)ktP#l-omva}lZ={o8p|TU>jq4|1jOQO)6WDF4(U$XNI9f4S1}!dVB|xmOSd^^ z7w4ML_kztAt?mTH06Y?*lRa0Og=A4E<4*?f||ulog-|utTB|^wFXK#t<4mN)T$TQQD}VG9VoO# zY;wVfVMNYTVh=<#Ly3vlBICq%;W%Y&Zt?)D}6caQO+UX4vn) z%CSi2ll3NaDl}|>z*kHi{1|f`x1-2-T>l7bNOdwDwoSU2Vb3uGJ925(}t+De<%P<@<3m=$4$hP#u zMSTukZ|!=8Q>6=HcEZg=RU!7V==k4m-)wECw7kG+Wh^%h%pIE52F|9kt+{jWhrQPB zOAQotu(gfZ1db6c3Uq;c?b0j?`r+f6@~Zc^ZQqO0iO`|ks+D}ZU6lW=gz+5{Szv4X zOmn5Kq|Id-n*`&K0gT65saqE(r*BN&sNAz-(>DNwTFK_BiD+$_S#jLv(oAU?rWKVS zGIU@&p9KCV=^7)yYg~Npl z*RNgyNI~@w&_Yp}>H-Zrc8Sq;c1F|^1_s)hxn=_gS&~GJ*V>H5;87?LzMLvtsX5f3 z>ovABkv>(p*@R6|qXNs9N@2+)N`6?ItTY01Bk6!2egGq$IQhw zUSyGgF3=d3D}BUzebtZj`D;B6vG(U3v}q@CE*DCW2is9%Jw1UGAWWP;{SGsIolZj{ zxkc=x;KpWM*=dgQs^C%m511hVvjuDb2NsZ+4-p7Ec6PZ0t1i_E@n}KKOd#m!e~eOg z)|4uW*_53uH|s^B13z5zW7H}`(aaq7V4!jA*oYa#O0?t)x)RJs+#@)KxV3^#S_@dO zWnH;%r+CJL3G9F>J4z@egq9tJp5nMY+SOYZ``{3u6bCCynY%AGYUk?}TDIXhRg6#n zD7#{*c|TmXlQqmL+H>?##>N&nU>q8^WVNm4LZAHl zI7^S9|A5*Rz%dXV0q*(tNOATN2SA37e^we2b;{7A(1#dn5NboHKcRFz4qq1k#74x5 zM)+;CCB~)22(3Vdk-OvHW91M-YmyyOE{kxSb)&^N>SX>AxU69W@fO0=-Ry#P1{Nlv zJW{gI(y$C_3L(D*$vf;$pue%ANI@MDYH*l`J*SVcQPh&Pwn-n^FN9DdY+LYeSbS`C z6nXP#*~BxA|JZxxXAaKD#?M#~C;BB=z;G07ow^716n*ZMvErhkV6Tnli8>4RXaq6T zMDaU^JgkKuTG))?gI-w{YUE6C-p$N&4a`{b)-Y!QWw>%?4XELwF2t-9*O1!nw9L~E z)*mYDJS6Yj}}Ly8)oov6t<8TUW0kS zldv8IqYb+R3U?ZoM`3ALObC$%ff+Q;5{Ab4N!U3HeO8PgColPoVSw0~yxwpf3X$>0hGv&~^*^ zJD^~E2;uHPdk|W?P)as33+ts&R%q zC^(#(-q0G%t7T4T%6Q+4uEE$EhKnQUVHBw;wb_eS#k9hUA;;$>Zq$nfaWLeCpd+lm zCh-ujdyz*Nk+hlihA{FxR5W88dEeNeo`J}H5Hk_p5`w!`+-+Mz3sJ1Xq5f4s-MDLjzIcN{2iD3Cnm@S9d zzKO%`!Jc|Ti8R=ba7t;#;_P=p#QA6rjS%W zRhVmn0%Rjy%Vdlq$Px4uGzlUKQ(*F$MgwOtC2 
zv9W?^4Ca^3`U>Wc#7Jk3VZ8}!0v45GGR@M7R+Q5>_uikAJ*S!7|A{OBc)nI`fp&pe zUn>psOh{p-AYvJ}ghLjht2NMzpQ^TuP3sY%vI`IHU~FWJK>I+r67-APu(!uR&!Ao- zT&d_H`iTi@;ui z#inYFg=Q-;<&-#x*0KIh8uGL==hMC03FutsSdDMdzkufY7!D!mh2i)aCj2BFVJZG0 zeLqD6MdKF@> zB@juVrD&*3>RRYl&~#WT{k!ZzLY4-r)iVwtaV50h005Jt5DE=!6MRjo!B7Ik(!{Ig zA8k%ZZy4GMv;k~wv!GyL#Q--A14d8)bWv(JLP}v5sDV5vCSqgI4xksLMUfbEt@)E^ zxr>m$jerX@3e7u9RV7Jp_+;Ut~Zi0&> z%>@Afo%IB)oIXGM>=Bb)u7w~cEPM;gEAx;N6abuHa|l{r6YdhyoD6P`fRR{PYIH#` zw`T|PnT-@c^}Uaw52GEvT3uFX9_0_#L^BJYQb^`+67Gm(I3TmSpz|f(-&L4?UB&BiE~8)7k5#3ezc7^Ke_xEu0(PJ`+k|A#fCL>-18X2>L3j%xdG8O_)yXjq8Y0cj zs~mD-ygb*UCmEbG$XNe2CNRKCn#zFIo|^#bXAZsbsZ(zr8Gqx{n{OO>6YlLdZmS~` zhv1Cg9v9KN{#(ecpTOx_Z(~6P;1RRPRW@3(zL{@_y)Y&KMpO&dk6HI{v8cB#AiB4v zj_L2A1cYpv@*Ui4F(N&ahrs;(^l9!(0*{ye1~(1ciKC|!Tg|Y<4eCLrPE$BtcV2?0 zll>+Ta2>gQh@?RDIF(lX?RM(zcDnWQCE#I!6_nZ@YETqc2LaTP3F6Wm13kREK}9h> zWbp8NsJ-u8bp?OE1;cqt(wBKq;!Xv^{yVs$gBy*@+GUsG5J|ZerN7ER?16?&s1}sX z!zd^k*B6Lp4d(kV@KNcs&=gE-Fl0nms`}rug76Z=b$7HKZ&V%>otAIm6QzDg<`Dwc z|IV5?oqlfpynTjKN_-oNY{LmO7f#6f@}FUWKJlnk@K6($-A>bDozGcho!y2AkHvz3H|R7n^Jk^fVt=INgmv`4H!H2a1%^84J${s zI_>4u2yiJZ#lT(~rjv1-yK#A`W7HcorPLOrSZmoQfKQ9CJ6p_|(32?GPit{djR{lt zDcI?4WzI#gZ66Iu8dye2Yawl$q@kZd+E9@8<4D^sX|VX)Q0`ku+aYPgK^n)i6AY;t zeHvB_WGH_R7FebwGk#cP1vxU3!_rYNBRS|}9R)8F%t=V?7CL7*SeZR)udI9;kPGZ2 zbmuA9|HRdSYO1{n)*i#&ChU|EFAH{b&Km)v9rIv)g`EMNxH1mLsLVi;Y;g0j|guIs%vK3PM@)5(_kzVha8gO2*)ALGwam zOimW&NNa#*1$}bp!sScjqQD^yV8{!ySRE1slGl5NxRDwJaTZX;wLUhf*9()})u@t> zLdWnWg(1KcScT$fwa^sBjG>u2LTdED=n^20FC1ENDhn%h=^aV&g1{VSJm?81A>wX_ z-MMUNvwr%>G%4CFWatr2qTrH5)>(w^rU=IT$%(w2uhL2gOiJ#uQ86)V<)bo-8Vxlbgkmu64B-M;PKxFTQEGxBxqHGIvr61Uyi8}QL{ zc{=dS%mtonm+7%4wNbflo1B!1@6GXKXO3H2CnqVscuJC7^#jgBuTa%Ze9q$2d_O$d z8tM%;*km2)0h!3wM?Rd`?(I&gLiE*gZxOqxW%2f_VYqvg^*1^5LI(T{ebjy#TD(f1 z5ahyn3#z}uxLG>q>0F?5kKT7bTaQk$b1US1&pP5Jsmeage?c@0Y*R`5Fwi(Oa(69sVJC>6o?*NCO}>xbDRsf z5FRDL6DB!?D|`B1LpmOV99)PRl>k$ME9q6qv5G6y7P?tGDi^51S(aV|R4gE=CG|Lb~NdPP^wDK4(wA@JP0QKR15N8Gibn*xX!Ehiv%?2lp^aIRb>{F&jp=E*^^;;-G 
zIPAyK89ajls}OSjn5lK}Hv#LO!j;+QI+34cv0eRbNUfY;=H#B3YONRVmg?mdTvBEh z(-b#EH}iO|k44)6pnr;^`bjz)U^7Jv8v*Vx!7(T(N7TXsFa?%!kp+g@fPZ|e1hNt= zG6!O3InnGhGub&aLEUf`-R-5OkvEu60h1R{Oaz?*CVkwm;$B_|deI1BDs{2c(UL7pLc6I4 z&~lkobpCSioW;i>O6uN(&II?K<+@V$37f2+oOWiwXoHAAJt6(K;ar##w66(@%R_MZ z$8YpD&}PEJJ9|a2NW2v7;kt7X_jch%JiycaWreJK2 zd-7l|LdA@igPWx$tT^E1;ZlSWDh`TRBs@2s7js8kn_I4w@nnvl*0t&#+=J6KTz{`u z76NbXZl$tZTxr072R5V1AC$0KF-A@(Hv|v{8n@UNmG6^&83aj;OMZO$UXkDidaJtf zv-AC-#GTS|y;)N}4NriGNruw4X8}U-5}GmatPedyiqf-;f|%kE+K@5nK}@;M(;1>O zOlK3F5js2Q;7uw+2k?$aNaIa4ipRb`xyZ!}RDs^n0%a^RTy!`9@iybmJJBcLMk9MlG0^~3Xhs21wF*RYlK|DYoz zf0aHtYl3vz3}F}GQ*INme^)V7CUummgyNhNf04)4!zfCMx{ z6sxA=ga$9610fRb;lm)%<5t1i1vT1w;~hh+kHYi?KIAo|JJ1e@^+LFn@4||pV&HOg zH^3Yh`f&%Y5zpZ|@zdY>IKY|gK`rqKuaQH-c6OFvCmF^y2mDJ9YsQz^|IQz=(OG;% zR@%jL07L7ad{J?P@rD9k{Q!fmc#}KrZ@L2I{X-Hc%?2lu)|2IAP&4i%L!2GJJTiBJ z51pYo`QtY~Gx*43GM;(OMq2-nw`}-? z4z@?JxXkWwazh{9oMh}OU`RA^S4nd|8kSq_(D-;^5KUlTar1|d703-7+z5nhcl8Tc zV`#Z>d-5}FyoE^?bup%2QdR@nvUVIyEADTvz7Cl*bj@CL0azBh+Yw)bdJ`z`pX~AV6#vrdHhGQHaDslFJNCOeKnB@~J_#TjN$nO`T zc+rL}f*ECt;SiW)w7^~L=7=q6qK5q|HwhKYx!O4}+CykRjeEmLL%TWPD}c6Zc!mWn zCk5AzDXSfCP;0lz7`feh9eu?^CHnPl?@oKkcI@8k%qhlf#KS)U9XfR8;xqk=6Av{WW%d|fd)^g<_kAlGlA7Sc?#O$OyKp!X1&J83Spi)0g)`t zQ;kVIK{lDBTd$nIem>xs!34i7C)rP4yL$C)318X3yO`k2ecV{u}3ScIr7( zcs3~BX;3QQMuj4g2DPnce^BXTA1qB1eLHdG`bB9wI#q_^N*Gu_&WjqCZjO)YUq=1@ zU=8+00!52;DD>+3Um{jhZllMs-OXYD6$W|Y4ZQd&gNGRucpyOH2!nYhP}T!8YXBdb zeN$Lfh3XpQ^pZi>f`-4#oc{z)hn)TmMo?*~-=;%EYbTdpcyWL)s@Q2=N{-H0z|b0(cDlw^HEw+vV59v#p2|bx@<)8dX%|RC zHdi1G_No^ldnS-33P=-uL|Qkx2FT)B=EQ)?=8jjZ$2?-FLlCXS7NK+JDbd&67zACS zt&>ug7o4+u&*OBk9?{j|Yy}3UXe-TgB4;8x_k%a?@egjjmw#}H6Hh6jPh$4~`EW<- z=eU!4yHNICcVihj@Gl`BjOQz`fHxb9+XW+#^|h4~91gdcK1E-zeensVJcxuh*gi20 zw%%TV{tke}o5T@o{#*`EYUxFH)@Io}?#_;CZg)qXx=mu$>tKt9v zViEwfX6fe<0@$D&9T$-Cpe>Ew;ET1R?YJ9X4RvfYXVM z^~I8%2YG>~Qy}+Q#Dm=w4uHH(!(@ge1tsoLl`=AS$pI08{$r+hLII1OQE5Eb04w-z z8p1(ZE?}H(l*z3Pf_*pdo!Q)ri+Le=UX?c43rN=c*mw4Xu_RH9FklF+EW5)n>4DJ@ 
z4RvJZHerTFpnc$e&8QrR3=7|b_@=@=kMBq|4BrsR5A5t@HGnB%m-}kNuoia zj;rd|y7M*CNyjX5)O?N>+6}N0tY_c}Gyv`vnk|5&8ikx64*;!=VT>e#&?k3mAkpvc z2h?G$Z36-T0ocFl_xNC8{|e`!GvN9}^vS2J&luMSWRe+whQb@P_l55KBix-aOaH`E zW-Ow4xxF`IL&5w6GZ*YB2x;&XlG#1OFs#6ydP2R*rf%G}Zrn&WZhJRwGh~M&-X?Ez zl`Fv+-+?sN+CSe}ce@884ln-*WFJNyoE`XfVEprD&X6O|Icq>N8O))CCBK}id(nX9 z+8MB1&k_y@#WfT>o@(xgigGqBcSMB}!$m$2a1cw(x8jqNBtKf|$;n1@Qdlcv!nF%^ ztrv*AA1>%G;M5zXQvHlJ^D|7lfj-=_8dq2fN>(25qNutK>@k~jo)Li2a*M9i--Ym zvR`XJEQ+Ut`64{;-u)6%LCX^J_b5lbo9%I$4K*N?-yX)1u(EM6+t8IFr?DfzB6bwF zy6@^Qv6_7>H(=y5GX}vJohq-W>5n4Sk2YZx0kLv<;f8R^SJZPboK`|>>fdFy9-bFE zBLZ~xFEA2^Erf)6v_K*99+WWCj#g{y)6o}t{9&S>E1b&S2z!*wi%9quhZ)5v z(b1e{jOZlF+c|Rlkqje`FoTctH6*XT0+KT%0u(+&w!OV$^@O(rgzVw=&cM8FrxBpE zcLgSHc>Ky46B9Ri2J7A!geYlfb>n!T1J7U4AiG`g3V5pBV1vS13NgFA-MZ!NHsWg# z%G9HQ~l?Y*lfy}jteM0=mNkEUdJ5^BKP53zL=2B=?gbKZX5 z;(|Hc9{5lWqMWaYm^@(~iNzyh6gWXh4#qi!XTZpsXUMVwI>96hrdP=?=GzB=e-pw6 zXH^tB=sfTcu|^hJmCfGPM~U`97k8!5{vqg#qj!lrKf;_4BaE+}#RFrwlZAesY#)X> z7oUT|<4w@ZB)l!&7#>0!_C_9M+IUim#$AAzVb$2XaE1Q+D_MTxtgl@Lw$N|BWRfD zjxN9iP|%A6!l=@F>N9qrFb(i4x4$U6HxJoM4$ck@MR1b{=3+7thy9h%-eMJpER%p2l6}1zk0oZC@}L5)s`FNWbc)*tzpJ543Qg^} zdN2Tx-WqlP2K`@OO@ht}T|GolqZwmIw%OGivfL#Rc=nZ`!JC;YJb`hvqt-UpCVJK{=ycWyyKygV>wi+3$49 z?LXxtM!J2YZAj;g_QjiC9s=x4Yq-FqQ~&vtXpkPr9$U@&hCDa7i(qxSy)sUimd7aHGY#PxZM z#5|AjP5OG~aUw$z@d+g-c%h~Ob$Sz*_lc5?>TD-{ly!*Nyy#^_+MrLP0Y7>fj5uDZ zDK)AULEn6Y0%yL@nuC;_9|h!e#wYp|;azQkK8lL^^y23j9Ht}6@vd4;A7dOzI3r2y zY2JFuIZ??8nJ9V(;n#cYy9gzz&Vw!jmt&5zK0XhiKSp0q_9H^Y9gGtxkQn$|3(5!w zL`-m&m1N8@Hu@6_3YEBt=p%N7oExU>qEGt-lrx0+HNP>jQxqDI4Gu#!NBJNvK8#F} z{%m+pa957E-=a|CAy0_bBIbp5(r3A_ehvM3kIPynvBW>NOwE=MPPZW|41_j7&tSZa zfgCj9&K@z4Gdghy1|T`#g!?d?6mIN%`EYw=b&rt!D#}*1s>Q!?@jGNH>ZdvJ!_2;wBF5Z$_W^RC!o! 
zLCsOTy^3q(&X`;kKOwotP;Ua{+E#qS+MpU0CYSn#3z81~de8#35w^rZ+aAPiMch~M z&T5LUeu8WQF~#>>J@)1ukJkU&7%#JqcX-t<%%EX+cYsXXuAcFBr~;V$on8*i_JLr= zc5}vXSw5`xpx1G{F0hyX`!KcD8uCaEbqb z_l&eff;R_&!a)%z9B>EKAu$o$iXKdRJNwe6k!Gy>`71!3YxtJ_mO703xu13i`)N0B zbXnTPj>>!R_l@{Sw)fsON*WQr)&*2-@0*y%5o#Ri~cPRz%Z?bmNF#{6Jfs5uFZN z>5b@2(8_)HlZ$db4+3`k9ZSU?^nD@bqRZG3a`OtyeZkA zhv35tpCI2O_r3$NexG*`ZR|jvsqVT*VIn!GUS(e%NADGG(R}vcs_Sl6A5)jrl`3xk zpv*kr!qN6P`ZI_ATvgZ5g9CV&;p1Q{jzXt6pssiNR86#xb$8|s#9AB+M#D334C{U~ zn5Dj+qUD>^J7Cs+-P_yOw_iu!UX%SAL)xEs`}@-V1Zkg;G`_Waa`lwE**hq`os!=6 zk1{5syv5wI#?ot7VRt^+`e9OfAY{M@fY$GW z_=d3wSOOt$kO6rsLfr)3jrpk~6egm}MF(t;p2;gBCKIt24AB z>QcCrmGN1~)}Xl%F)5tgLaogF_(>?-FR>$cD(HuFyCZ-a?UPIt&YPc<0Q0s^43;jZQU#z zR~cD{AS?`~4jq|t=WDIJ{B{!Mc}#%DLw^1U{ac1a zJFP6*qy!W1li*njnnPiTWrerk_%`5iShn>t!1}wj`3kPI7VzUB%iw<03rC)lfp&J# zXIXyT&F~dGD`=ib;11|3;!Nq+x$Qqm$EL#{xWd69!*IrA#4#&5XWnifWw7|%6ztxj zlbE{TxxuOV`POD6b$*eH8zQ*=lb3GJ*Q(QjtfBQx|E2_&B}U!_PGqnp_dYd4XSRb( zh5k&qr7DB!wSw*X^mck%ANF=1%4=*e#QRB1$Ww3TKqA;L!WDvl3*)F((_85Xck=V} z$vvFoFxEDJ)TWG_pwT?v832%C2Y|?;*oj~p+9@2@=7S=_f*&Xv<0$4lW$M_lJgD6i7o*Bh`eaR80Pt6Q379n>oGJ~CEnQkZ_%tbon|OxhA|bhSGT{ zXF`Q_A(O_pNBA}yZ8%_nhPO(T@_Q&e&?4rWn7TnsOQ zK~LBDITSIv#?vIw=4O!{}+kCOh z0)d%A$s|hRCy&2~*T@ipAvf+UMb!}MfyS>}&#$4LVIjdX-MBwM+@>z!gqnqL8fpDb zVMdfQd>jZu5o)$RoPy$y2;;Ue-W-uvYXm&^$&3s5Vok!@j`63k7%_pL(-yyiGCN`#ZV*r`ayPYR06p%W&_yuTg=ZpE3K44 zxsblW{P7W@J25}-{*M%5ogJy&RO`gLhM#)>M+C2UY9^5MO^pwHICAJ||9W^~10oPm zpB0swvG_`1RgFeX3t+vB0UF@Q*)&?XG<_Vo&d|xyImX!#he8tlD&l^hu;&0mp`^eb zK#&pKzez)r2K%fCApqYfgkL^VhCiG{ljrdmzUMB;Zz*NzMdStjm#msHK$AfzwmBU| zTjSd$ldPX8yYTZ%VoNA_qzvq;#h4luvF2gci9boxpWql+Gjs88JVNKFevVBF`u!97 zs6sPuto}58M4^7lNT{%sMtqkfts^k|XPM_)aO{M0kJfn4nzwU*hw*<%=R0uxT=xmt zDS1IQCeO*5+p&U`X_e;VS}Pv=(3WCe z#U7djpsKCSt``G;-8Eka}gbY^wHee+^yjvLkQ1^`$M7Z{{otg*4qF8 literal 0 HcmV?d00001 diff --git a/gemBS/__pycache__/database.cpython-37.pyc b/gemBS/__pycache__/database.cpython-37.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..3f73ed5945add9f2b39d7c91a7be0f12aa615f63 GIT binary patch literal 12820 zcmb7KTWn*;c|JGZDN3TK>)yE9E7xo8+Hq3XUE6!p+Ux9Uw_4fj&=-c{98#1hQkrwL z=F;aRK-X>EAZhl&2>ejARMCetD2gH|iXabpNMC}!6iEAYkf#(Vx-WSO{1Bi>zyF^@ zij?id6?Nv!nae-_{PWL$o4GnRmelY!`|rKCy0)I=tutekoO+;*cBk{;AC53)ZD@30FuD1L{ zM_SN>iEHhxtx8KQb*lEZ+i16{RNem@AoC-X*cu^#!F3F*V^E_MX=tUW9qYySv{J%O z3R74gXpi(#3i*hLaz2fGOvE`qhI~RKIiEp3CDNQ97stey$b6ueCd9ay_&_U7ib;{h zJ1g>HO5{*8C8os;N^;`3m=$v7Ica6E$Yf64oBI ziwF(Gj4TJS>q|GTynbsnNZ0MHHK#1rYK^AdJwfRUD7t#@<*l}`U)-{{+VbJ)YNfhv zzv!%2q%FD;Zd8Z{i4CXSDie}HwAyaf8uh@e+inmOYvopD%MOx`Tam8wUc+4vVr9U> zI0Z8>t4&A7fV+$%`Fl-f@cK>R!t0JLoeS%B#a+Kp#po9pkudGkqPYDqh&Z;pyFG2obyI{db`k&~Y;h8lo&xYaYmsCrO zTJrK3%iW{bDy@90?dGLjZ#b?k^I~_q*{D`rJHKf^EJT7xx!JC62C+(YXSX5kU;?Bq zSKC|Veh*2=1TkqhZE#S*;t2*OdV~J$tx`;Gm79QlaL!2p2w@!s1c&#$k<^nQaYpY> z4d%b!Zor&GBYKzyd1QuYz;k_3m6(IMVg)RjMN3JmDRY!eBPkeaL1w6c*kfo_lEQ9= z7@kIrLkp?_!}u_E`)vkXKrM5;RS*En(Mx)f zJzcK5@m`8@y0)`^U)wo)A0zbAM3dI5of|_KJg+baA@)8O;*WPSh5ic+wJU~Y1zUe_pY?^Yx#h&$eeFfIu1=q@3uDM}tL_7^&tgD3e z9gEgkN>A&>c!z7GOt5~ZsckQ8yJw)4G1|&AXx+ngxv~0$7_wlL%w5?CVs2y0ZtuE< zgj#_K)bWA@F=wZVmH1MSyw+~D>?*B95HFV-t%h4J^U4KdRSC&ym!UG6wga-S+ttm2 z6jcOI_TmS zy6xWqbE~qwJpf6}%wR`h(P#}3j^|Xiwm&`i7ay=%X*LfI=(wW;ZnxXb|JQ)`f5!|# zC0VJuifG{s4-PmosUgx8OnC&0DoHJn!r$6zxRBEqsq8c*2RN#TvU_+O(y$uy6RDA; z-klln&R5HQtZq~A0vaE2N?}^V)nq7>FX}-Gwf6!CXY+9&Oi!HOL7|fuD>%4 z+L_qjyhDAf^Sm%2kuUm@&N;89xfCo)8f#^P+pqs5>=C)Vq+;S}(po9hZQ^*0Z5)-_uB`w78($(5>tKv$s9t$yE zzf!E-6lAi=lPD}js%3ohq6Q}9NnlF*Q-(sS+TMOxcH3mvy@g&5@dc!`5r?Esvb2t4 z(dn0j5Lv*xo7-Rb{mvui!$5wV12B*r@`D#M6v`-p$rQ>82~ zBmr3N(>nYFVcbUD>DPvI@3ZkU81x_;UqIP59%X9sY{0_%rX~+FU|lj}2CTb**%0e) z7$gJ9GADDvH$o)=_Hv4DRG2U%kY~eCv8kz8(=%#1D-rI?#s_&Q+MeM@z^F+PpWu9o^GOe7n}+;3m!-KZ<)ID=@aG=-(wlKi;cE;~(Idn^bx9K$ zKg|$K`KxdyNtiXi65}W*OePrOm^VhFJl5p{H{BZ(lh~>mZl9sq`eR45q_)Kbj;DOOX=aqVfYoky)9qg=Z% zR2xSLLkUMPnFNX9NBje0toao3ne;1B3*?RWj40tt%{_sh^WB5^w 
zH90a?Fc@o6jTJ(QGsakd#yHEodjtY>qGmAnl9)Z}#l39!O&FSwO2`&$pwO?7(q_jejv2G!@{b!bu8jjTf%0Wz}gzq+)tq)fyYU(BBw?)61@ zAY3`oQDYRp$Qb*53bRUE55gYu7+6*2DLGCFDLzSYgMCdc6Zj0Qjdr6I81V6-)wj06 zEJ3`_20;Xhyt*Dl>axAN9YmV#_iQ+2;53fE#O;<)(U-aMwzRhqzEP}KMiyLzz(Nqa zd1pz=wp8Z8lrrZ?gOSp&u?w&&iQnpaJy;;f(EdzZg6jWpn>S1yy$XjzhL_+3d8o!J|Q3;i&Z~D>K z1WMCzSaO<$cM{P7q_9EA{8Z^8=W|9Hfg?PClSFRLFz}9}Kedd8B{Tw+%pj$Amg6Cm z4qG&AZZNkEESf{WbYKm9j4evm07dCw5tv9#=yZgH5V1pCk|K7{>-}$wzoW2tA~0ZM z4d|fGCFoa}w`9N~G>{;@yb<%EyS?~kOm29Ix(+!D9li4^y#Y7wB}h*0yhO0E3i2 z=~><+wVl^Cl77~U_EG{P?-_U}yewuk<)$~rdKqs@B%$w9d$4HyoJe7e9712|p5^64 z8fDX<4aRWJd8A=4aj!`)$NKh?7(;#5n|!2;%%+9t!H?x=e#+TKS!4EN?Tb9p(S%b7=@9dK5WObz%gVP9zy`RbLGYj>_M zbz_(BUAcYt)>1e6zF$P&)j~w6G732z$bC)~sWPScv{LN$$}8ooS8hL{sLkiJzlo|+ z;x@wBH>qhUdGF@(E4P*?-<^4V8$Jh#|Gsn2@saM-u=ozEx`kn1tKVKZTx169{}&1~ z2gEsI-5d+Jj%YZ`Z6(G@00bFT1?4R%tQ|xVW`JZ?j&v%lWPy!v?RK+LwTG0$Eg}Z% z1#T z{Yx8J!@`$^a>#nl&p_hm@FVG;!Oy}o16iF5zoT3>gL-HNV*#b)rxHAby2xWCxsKm( zaAMej9LRKvcaWz|3fuV+0*T5?C8<7$a&d?N5}w1u!o==AM;2;B`wPvbQUr)aLT<2IcIKK^12&^J#Nn9bm~L0?9mUh{rY|FJ}llp)>gDP zH8-{qhu0U|NceGhGLXarUywYT1b-(*9P%%TRw+LPnEi5`WbGM`bog!klCe5@mt%wP z41Z^!H^%Wy;F;{Kcw@l$HAwrc7-#7i-T8!kJeNE=cyRDkraEWci4QgMEep+nW5K%NO^`H4{cD)1%O-k z4`=+@&Q)&~eO>_M+=tp`3{v-dkgnz(&7W@N2?D5WMWC6e%mn2izg4Npu)hHrI4dxx zx=E6o%eGMl|I2B1KH~L+*NQRz?Cpp@hcO{X{X^b@$g+p^u6IaSkoJar7m&x{gBtKw zSO@rrp(PFrjN=``8-AiU5AAdjd8{EIE&%Ic3=M^$lIyKF=7AE-BaZ%))EBh)<$bNL z`A0yHqj(>~TIA7AVen7x^u|nYRzP1u`oH;^)HHTJhBms?c>%gXQL@OCJWQC*Lo3K1 zd&hcnmx0X+U>V&bjkqx(7XnQ%2_QM+OeL!uf&z z#$`NAvrqaffH=sax9As$1yH@bf5t7!wQe_+CG&+C>EfYvcz~|-Rt@gmf#pM zgR+o&7$(!>vAI(M%o6xf&3hU1&iN+}V5ngK&^zI!H#Lm%EAI%fVhG+*@5CQy{?XRN z!I1&`hv1nbk1hFYv`R04+u&uN1f@<1(?89pzM6lgbIU#Co#ef6*+1Jk?49vWKhnKC zxWi+I7^=UATAcZV_;1# z`#F@Kf(Ji`BXpDzxA}{-+weZ&Ez&v<=CsIsd16*Wxd9HdcD@D3C%sdEbQr5cys$_R zR6j*&wE9oB2@9|>QXxIbLOJP!k2a2Y1*As-EwW=^CL|LN6CdF~Dpm&1BAtRIx#$&C zUs#ek^gic5r>v-uo>|M>KA$Zy!SM0#dBV`7g{)r z_b)JBi(Y@6bKV7o8>sd#_lb5d@g5*NU-K91pxVZi!U0m`Ipna8s5^`OF{l|?Sz5K& 
z#xa(n(VZ?}Mt9!DdT71V1Do%}a`)L$qwX_(<@}iuuUwM3#pW4XbAc||a#RMz+f?^u zB(NK+HMSuq!`^g`ffHvO8kFMCe)%H*1?e`vac-@GvIr?tMx9H)3n(`ZTAlrXdHn93 zJGaVLuiaSfz9iQ2>dY&@UU^{W*KE6$uQcJ?+9HoTA_(_zu%`+i*LJqsD0v#^>TA1= zrhx6#{lgWqcyY)s@@n`z2Z`wPz1H4E>qe_z!4B_!An6I20E5Uc*|$`{M;>G#2)ze- z@6q)Y9O=g|GXW5{*V&vy5Jm4CuziB~rDkPoO;lcT0L=jmt$TJqWX|SDNUP!{oc~vv z&P&LsE+dGL2~DABKEB8HhZ0QDS+Ks$LjngMwZNfIa+;dLhHgH{Zy*F_bZ_A_!vQijnDTA?{V9gI~;= z&w1I6DUCT>I9%m}qOvj$~0CyhZ46n@4Gu%K`nxk?#*LL*lz z!{d-UMXe5_j@%~-n$evBa>dvY!>kCK1+*h)4X&96XG@C$qmw8nS1pIyPq|$o^AFsv zIieY!Yj~W~Uo4cSLT6DJ%CXM*LFgQ)+ zf>L)rLZ~c?u!_Zwz+t#XV+gm6i3qu3B8sqCnBQ@^9|C7SjMw&f{Bk(KZ}rA+RMUeNFgki6;w8bKvl;1HHWFbEg~@R zs0Z9Ek(z~afzt!0N~GZ|?AMHiHRn;28L1f$Yi^)sVx(p=tob@>vLiK9VNDk`IjTV@ z53@&_Mw+Xc{*;#m9dXqWfjASPM4G`Ay*aS;eUb3TcfJqf19WkI0n;x%6q<_r)7~`7 ze@d+o*wS}?$}J7K=uY$|RXB@RV3u8*MI0M4UEd+26K){a&3Ut7U<jGXMi$eCZ3 zUjfSmsqmXRz*(55VjoTeBSGVg$x8vf_rRc6T*edEyl@ZfE zSW~90__7w%IhZqRPE5H{< zHmM7XSc*JJmZUApwJA{%r*IY&aguZl&!QHpVY^-mLvBQB=~C5SQ?g45*(;wJU3wW6 ze?y{|W;BQ{4aJuFqH;L0M8b|lB8kAPhO4Fr9|*oL98U@-G!SfLGg^39u>B=6WKC_x z<%A1wgYE>lxYUB{iH5U?5>wq07`a0L@5d>D=@B2oje=5mDhtn%HZFW+J=cyN2#$j~A1JDt1@Y zFi$_t{neGG_UFN3^Qr9Y)@qGT9$jcuxohL%*f4T-%h`pifWqMk<>@euoPqF=^!xyg z{!HoJ1|<1j!vV~Oi!KKsg?;lO3JnLI2;Gt#p4D>q>~cH*phCC4t~Q+A=Jq=7{^eg; zx*ZbU*|w{VTEiB{yQiphsF9+`)-LXJfnF8de3Rtk^bdA%_6rhpPY(6CNk`dgE%#lX z7xJf^LR@|u+!!PqPNN0O3TFbawp=-6J|0KCT&3g^8eX&aV6c m3WLb5+kh`Xf=8aGga-aBjNp$d6264>(j3B6UFcG2I`V(#o@u@S literal 0 HcmV?d00001 diff --git a/gemBS/__pycache__/parser.cpython-37.pyc b/gemBS/__pycache__/parser.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b0e9fc97f94d24b7bb3ac69165661b34bef184ec GIT binary patch literal 5495 zcmaJ_&2JmW72gkXclj+*-?E)Bt`nPXA|*YffZ?=)WAr z-poASt5k{#zS#ch#y_qq%D<>_^5sxjLy8_EV+vDyik+3Jt;)7$YqHdBU6wgJ_hW@= zO#eh-x~J?YPjowvo*c`gC;vpV4W_=YG>xa=sI0drKlyZ2){vsNk$DQO0PbbUO$C3+ z!_F~-nV%?jo>{Db(qKhaLTO6A6;W+_{jKQTFxd9n552?n7Hxt)Dh;G}k)j*O5^Pv#MP6^4(@r_b!%3jOD7=K?+0E4tBacU` 
z9nXzBtF4fEtJurE_b2(lS99QFfreuFlu8UYX~p7vtJko%Emfq-ro5=b1V(5@lqL0oz!6qW((73~N+C zUL30cPy@8ZxU^Fml?mFhCIKB@IZ7V@!^X{{^k##FUepM}xZxf8QM@8d=54!sy|`Hs zS{RAk0AL714n`6#k=qIVKoo!O_V&CVaUODEY=^w>#-i*|)9HCGL(Rk{h2ahcUciJo zQ4J^HL@^5Yc+2aAVYjJqasXil-T`S=>LSa^=;sT_Do9ZknWC1}>uOz_RjUO0(sQ7n zac!wYjgLxFx3oI=7p!eZ8L5CwakN<9(MJGPf<@M*t~c|U-1;CK@OjWr1V+=z*f&)v z@bhvmgO+5Cq`gDV7oU+xcodaXJi+kA$a8CWSRs`2Et_FOLrrIPXF>w2y!|T3@$FMjp}e_)b%(w%G0W2jeieyBdcS?l)BHa52rS6Y;GKF-rD#$-TZEI zBi$TUQNKg=JDaywZZ}09V%&RwIPJx)?1*x*qc8%{8ux_R3i|^(D|PO*MGo0pqB7{V z9qwh89EsXV)yW)OOrLg;FOo(?HF_LPIMX5f4dzd0;&r5m{F~gAeivLoBK6|AxCC;V zBO&`Mz>DDoOnsuIT8ztp9AZaf+NXM|Cwh*~N&WjcOdWui=nhnd9BNB*KDo3(`3`QV zyNa)*d9XBhg1DGo`B+QuA0p{B5G&SB~|~Pf{x}nAO$zSMoBl5{nhlI;L5{ zeY8qw{ehLydK)ckNf{N!3jZRp(n4a5ien`$vI<5Q;}Waltkp-Au@YAk>&fI?-VLxq zxvL$keEEz6S_*iSCOpclHsLY#jXWxn$48ljnExg?TWnhPuCY4iFGGfzi}RDhSf|@T zN>P)v0<&gi?e9rB$#qTe`WvEsjX18AuN{xx7&KKw7ZPmQJ%YpmW^HvgGclj^9>uyR_%$joRqsi8iV zOdV_dUNSWqo0hf8Se2Tn2j!%kPA9djmDG|7Wwsl{_U^OF15R7+~a zIhjc+N$sw3tnFHll*3y|WwbC>u-~g6Y3VG+t3PiVK1?lS%ba8uCwoamQNQBe~~Oqka-AX7J&?O z0yJ}c?Ql7n0b+Id9qDN?<9?jmy$wV!j7e=k3XEY#$m^hZkmM(v?_``0S0umJ(&7oX zCD2C0_}b_tb^{m}lSO)msl;I@`Sc($-IlMe3zawZ=iZdV;>sJVN9m2V+dsS`Oh0J# z_Lw)+8-+^K;PjjkW*;vjKWO8ac6&Wp@eX6|wqjv$ZxHfWwbz zYlw-L@M^f*vB%|e@tL|goF$@e5FVUyIZB7GUOx3!CK>kExVd?=DU=U}=8dDHqcly2 z*S>KrdYL{`Uw@`H9^f)F@7IgpQh?gAiY&vUOI?t6IUGk@P_cuF{1 zZa=aY_b|+HDemK0FT(!z+F( z!m}K9<735I*Yg1K9u{h0mB^lv9e#hn!-2=0f$Q_=QlwkkVAF92z0hZ(dT!^w8_DQw zg1#tCHr+#OAa(4l|JV|_2~Jnq@0tnIfC{I*?? z4X7g$7Ab`!76#SW7XKOKLP%-?m>u{Wkd@vlWhPA`f_NQ;v>5&d)mA85MK)Z0b`JUxXu&@gUA!n@)o}G|LMp2@8(D9+N zfeT~Q!C1bTZDT)+f=xeidxMTE@<0+{S)HVrp==fzy$c6Cdtl$7wzW-e1b@f-l81W( z{wnq65#F^roMJFxdcnTW!vN<~?De61aOh!eyFf2~c>vkP!CutK>U@qS>3B^Fvlq78 z@~At=aA+=Nu@oIXLfy7Og?N2(HX-;V3=qc0ut>XrojF_Z|E|!nh;30jgjw^CG24d2 zX0gBQqR35dttZsJ(7WDaq4nRg3#~oQWxQmY(cYFr{p!9@yLK)ho_WeaCQmm93)ni? 
zo{Kyv=peo>yM96( zF)k4aadjrrA3^qQBApUmfQD{nDbh($zk^riNo-B_`fZ3H9;VaL#h+e|^ZB{;uU!Jk z8Rdm2^r8Th|2#4QCVJT>DK3rhISu{k+`98GtSdc#y;3@f&D`f(a*n)3e_ejRmr`taAN=2J#nlth>D71rJC S|AimWOX-9;+yOtWzWjf4OP6K< literal 0 HcmV?d00001 diff --git a/gemBS/__pycache__/utils.cpython-37.pyc b/gemBS/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31d7e161b129d74df698bfbd5a7c716c1bbd5e88 GIT binary patch literal 11317 zcmai4OOzW|TCOUUq>@_QZa?F^OeHgRB97Aza}EPzjK}dS;}A3^c5o2M(Uf(oyGvF} z-M32p(2?c|7Jh&GPv86fKYQ1*{)G?uuZqkKT=Cx_QC8bhwsQJ*+m^f2cH~}a zm*noYUAdRrWw}?{6}ea2RoqK`Z>!d>+1C4(a#i`MrOILbu+(0{b46A0Ts?H!4Lo_O zhNqg8F5|hbmhilU(iJ>4)H0ry@pMA1s1r}E_NrP{CviWiUQ=h(*{62EWKzc=s$rO;?Nxa=pJ;anfxeh|lgccgVVNcso<<}gms8|rk2TU#6< z9PCDVIM_n@nqU74G__hyJFVRN;msfpyL`@nHZnJG#j8j{3lz6VZBUvkD5a~nOrLl5 zyWuuxGrm}4F)uyb8~Be8e5^AW4*QsvsUEkQZt8V9gJ3J{bkcgKvo%zsKJ$%EXJ-`j zvyy73qlVp1=Sxd_n1Li;U%K*f9P0SWW*8)!SGq$LUJ*iODajeHZ6D|oTAbnJT_m3E z+VW;$eGW=)u!@glaebkZxjWwN~g7*z8O*|BMlLtlp*QH)Is zRo%yLFG@CLlgxH+1<@cU#G3zbwlPvU+S=}i+?s&o^f!apN2kSGKiM1REV5RY7Wrsz zH~qcMsJrP$pmWgghiW!*pf|PyjcH1%(d9x@e-Q5FQ}X-KAoPc_oAG6`q^UX@aDW%| zFWo91!;Cohcum@LNnZYVCoA2;CA*<)kkc9K_mq0!*jzW+b{qVhQIga=wiH7H7Gz6% z=N)Nvyc%op*AgCTI;k6n{q?kBTFtmi*D%M_%Zd61XDrL8jB{pR%M1VC_5JEcqk+sO zK&rh1%s`4sgMr{c2e@%feXXwWRC=8l&#UR=ZbRo5En|+4OV&u$%43)RY|!}d9EmiO4_(@l+_)rhkCpt zH&7y12(quHhQFtS?d?#5c|#$V4#~j*)!k!=f^d?EPzX6fJ<*EMV8b7-8~(`p8EQXB zd=Masw!&7IbLEeBlH(Sx_#BeNI3K;;yP`!4I=NP8V5-;%Yl2#>W-Xc}!&TL60{S#4GRF|Y3#ahzJ0utBW>p*Z`0V|; z?Oe-yZQ8A7`MJEt9_D%#hLf&oN3P&9I@y6v=9@0A;zzc>KNP>ZHGqlt>*<{ z-0E`X{P9jTAY616nF)BPWBA9Ol*mOB7inoyK?(qZv^=RItxU>D0X>j(89|~De1v@jyBCp#GXpMG5Uxn+i5Xz4R+oMEe0n${aAOI`P++`~{eJ<&3n>Q=wRgr-Hm%;YsDv>*CaCZZZ7 zDOAr5ypO3ttd)wr_%~k7gt*W_UE&JOsRJO|)+Ki<4 zD9@J*j+j*vJ3FT}kcH%ul_*?=xXyROjx%wd?@)&`CD9sRc#z3|u3~boWY-tt)Y*GH z{*}Ar<4np_n1o%(=4CK-IG{Gm3gSeE!Io(@zI41>vs6yk2%c#)J}snTFY)McHif&* za$1Joi3VxKs2{n*JPXUFwQheHhn;m5k6$mE=hHXiWD;;{j^)YdX+46D2X+~B!?c76 
ziwJFC__VCT$D@tZ<)YI1Y%-~P|K9yOMqtED`|PKDpY@DXQH;{E8S!=GVU39y^)E5; znGnCWm$MlH3}B0oiyR^0C%9s`-4G)RoNK$Ky4|o>>=ma{I)z-r_UsDY;m@f%uQ?ua z4ZGrum+l)jBl0X{kuq6eF0q!7MP)tl^a?FtLaDS8Cn>EvkiW|b<!2l z#9s#peWB`zfn!l;H|UQ-Q6qRz5&X9yV-$EGNH3)Q-ODij03OocykHB|EvmJ~FJ;8{ z_u%TXFUW|7kdM#5px}-1FLSaKT+TORjZC2#S)e8WNQvUj@#}dnZ(WSv%3kK4@kJ&4 zG`CnR8y?`* zbPyhwIf~j(am79o%gyA9(ghhi1z9^~pM!kWZQnj)k8@{gjWZ~ib3w>B!-Onm7h?6A zvX3ZA_A?33_4~du z$gtZFgTZJ!AAMdMh=U%iiwj}|Fhk!4?`LpB4Kcw5o!~TWF|f$ITW+M!es!spn-V^Ua0Ggp=UTer#gG#b&H++36e>9 zia`9dV${N>|GBvDseBV!u~;x9o1kd zZB>4WO0M2Ou>OnEv^uHwmZtEtj~cYF2{3eqpk3@Q!uelRryfT-TA6rajXhes;4dMC zA0ads(#ZbHX_ztTy-$%pGrs-Do``P=#3} zw@V0<%ub#be3?t-4DO-5frm4Uj7~Fz`@CMe!@)rC@Lb2Jp7%T#E}9K8>-)~^6o6AK z+(fzyq7XoU#d+tJ7Iu0s?Dywf?}rR3@E?H&1gS)<(Od#yh_2905N(rDMnYKG!el5x z)dqQ>>P#5KJb(2+_>J8)D2UWvLx8IikJi_teYl<)p$OcK5CIm>yi6ivW2WhEHczCb zK0x#L_*P7OZ6i!R6vdO4`{8cbPs_LO-2C``k&e_$AmS9?^i&HwtsdfWr~`dKZzO%a zv#8WNos2=`JVO_fmX>8*Vsg?7gPa4^tQn9{&6>P$%I#{BO~KwiCZi<4=822)aXcd8luKK#iA}G7hQmS8mKE0{4Y`K!79x^u!U8AH;({Xn z5%_tZydCuSf&<)S8y5mkxq8R~MA>68bW6DSYx|1+K8QRnUyRo-Dnv*`8Gj3}^bAgw z<82%T8jyLLMda8lW~+ajFQnC?&Tg+Pa^OFro*dX&H5WYLIV<)lhkU5Ryk&`%>KjKc;yk5GtLHIwt9d}Vo3Q6HIHWoL#xUK z2b9O>#CbLnpWy~7iSe?$2eiLhuU(9Va6+{f=p}ONMuYWX+Q>j-hmDL>3o;Zhxp01) zjxl{)!Eq6SlMg=lH&@oQ#sy#llDBNpOu^0N2Kthn}O?YJve78#yhjI^D?>yp6C6*&jg>w`L;Rh3U>g;>~ zvhth)sl45v;PCa4H8^FXC)=X;*|C1l8aSVbuZK~~@xSi3aCgz`ZP-WRR|@qJd`{#O z4?g5kQtQ=^miYY4##7^H8Fm&=#&b++u$fYZ*&fNpQh3Ybe|`i=CNfJ@RzMZ#{)O!n z;mM8}X&;R{CVrzF?eHX~63oi;aKq4t;!!r;*0|(P{6aLS zHE2&B;EK;6A>7XNgZ0e%Q)dUVFLHr{pTS1R32en0 zUpY4Kc{Q+htv|#WNPHccye%}qcQKIBqR)*BY0ma^K7?hDU;Tz5nq{~bt-FX7aGl4` zSH(enn*~iKKVh32MVI|Mi_T9tIA z1AN7hsReN}Q)Ovn*KNRSMU zbAmHI2B-1>pm%LuG3t4J{KhT#Uv!)ZO?@~yP%LnORTy^>f=Pxt&I1(`I$54EnZL!y zRU}xgoI@O2?TKQw2AE#pa#`P!rPdLLF~X#Ogm(|`7AJ$UV|XPy@?&I_J@v3y7jgXZ z#8b`}?zFc5DPx|ZI~PwQ*zH+c;5zq z0y(D3lNyZZ(xfq2#wlY}mA@!MH(!8mt{r(oyc*$EP{pQ--lU5|U1~xLvva$-m!hUe zJbm;7`4S`3yFO3k*-jh=joW;|5>Dj^)lbgIAx}n 
zyc_e5puP`}{vI5;0GuPU&Abv*fBy6=D-doI)Jl8_mc%J6j=UraBV}g!(w9`rMqW6H zn?t=~&E&H#e*^KJZE2Xr4vI+_fARnerzp+p@D zudKad&SD^RvUo89sU1mRzg^GXLh-bh@;o+RPIWTEkk*R0+1r#>F2q_zOdnv5F$>S} zkoTPPCC}ChWreDu)+^znQV^9(<+Ne11E>9rAyRvMRmgW6QKk*7UcSMa4aV=FaF*I* zm;k!~DUcZ4urH0EU^rOQX}csxT6uyf-2eTA$IJU`aqgUv%tn?*ce~gmg8(FqCf{C>~O; z7>dKYxK87ep~QBdCvg4TdQyqq&yhQ{V<@OOJ8q1*qd!Hf)ZRBARpAWFi^Fix!3T0*keb&)K_D_Uv zx!ixfh1|jtX})gZirYx`A07a+1KI%2*w%eu=N-p|Eqzkg_idcg*w)yGO~ta0;L-pi zUr$_|+rYK8fwfs*;csgH*!~3Qdk5%y$KgBd1EA4~n|I#7*YfXta_hs7AKd%hJE^Av zs0lc(8$bVZ{{H&=%}QF$J~z-OF(GmLZ~#cpBqoQYEy+hv-1V(Ibq~thQRMhj&}8<~j3S*dq=t;%z*v zUOeRKXS4xLt$)D69VDp}4UIXKsgft<`!I7#!l=5a41okjF;EDEp=BRr>^TAUSSlOn pT{9nlUgxWr6LBH52ECfJ;jFqVFRiYu^jEe3iRv)P7rlyC`(LDdG~56H literal 0 HcmV?d00001 diff --git a/gemBS/database.py b/gemBS/database.py index 140d0ed6..c096363b 100644 --- a/gemBS/database.py +++ b/gemBS/database.py @@ -85,7 +85,7 @@ def create_tables(self): c = self.cursor() c.execute("CREATE TABLE IF NOT EXISTS indexing (file text, type text PRIMARY KEY, status int)") c.execute("CREATE TABLE IF NOT EXISTS mapping (filepath text PRIMARY KEY, fileid text, sample text, type text, status int)") - c.execute("CREATE TABLE IF NOT EXISTS calling (filepath test PRIMARY KEY, poolid text, sample text, type text, status int)") + c.execute("CREATE TABLE IF NOT EXISTS calling (filepath test PRIMARY KEY, poolid text, sample text, poolsize int, type text, status int)") c.execute("CREATE TABLE IF NOT EXISTS extract (filepath test PRIMARY KEY, sample text, status int)") self.commit() @@ -335,7 +335,7 @@ def check_contigs(self, sync = False): # And make list of contigs already completed in table if not sync: - for fname, pool, smp, ftype, status in c.execute("SELECT * FROM calling"): + for fname, pool, smp, psize, ftype, status in c.execute("SELECT * FROM calling"): if ftype == 'POOL_BCF' and status != 0: if pool in ctg_pools: v = ctg_pools[pool] @@ -358,13 +358,15 @@ def check_contigs(self, sync = False): # in sync (which should 
mean that the db has been altered outside of # gemBS) and we can not be confident in the makeup of the pools logging.gemBS.gt("db tables have been altered and do not correspond - rebuilding") -# print(rebuild) for ctg in contig_size: ctg_flag[ctg] = [0, None] else: for pool, v in ctg_pools.items(): if v[1]: - pool_list.append((pool, v[0])) + pool_size = 0 + for ctg in v[0]: + pool_size += contig_size[ctg] + pool_list.append((pool, v[0], pool_size)) pools_used[pool] = True # Handle requested list @@ -390,7 +392,7 @@ def check_contigs(self, sync = False): small_contigs.append(ctg) total_small += sz else: - pool_list.append((ctg, [ctg])) + pool_list.append((ctg, [ctg], sz)) if small_contigs: k = (total_small // pool_size) + 1 @@ -408,7 +410,7 @@ def check_contigs(self, sync = False): pl[1].append(ctg) pl[2] = pl[2] + sz for pl in pools: - pool_list.append((pl[0], pl[1])) + pool_list.append((pl[0], pl[1], pl[2])) bc_list = {} for k, v in sdata.items(): bc_list[v.sample_barcode] = v.sample_name @@ -421,7 +423,7 @@ def check_contigs(self, sync = False): st = mrg_list.get(bc, 0) if database._mem_db or sync: if os.path.isfile(bcf_file): st = 1 - c.execute("INSERT INTO calling VALUES (?, ?, ?, 'MRG_BCF', ?)", (bcf_file, '' , bc, st)) + c.execute("INSERT INTO calling VALUES (?, ?, ?, ?, 'MRG_BCF', ?)", (bcf_file, '' , bc, 0, st)) for pl in pool_list: bcf_file = os.path.join(bcf, "{}_{}.bcf".format(bc, pl[0])) if pl[0] in ctg_pools: @@ -432,7 +434,7 @@ def check_contigs(self, sync = False): elif st == 1: st1 = 2 else: st1 = 0 - c.execute("INSERT INTO calling VALUES (?, ?, ?, 'POOL_BCF', ?)", (bcf_file, pl[0], bc, st1)) + c.execute("INSERT INTO calling VALUES (?, ?, ?, ?, 'POOL_BCF', ?)", (bcf_file, pl[0], bc, pl[2], st1)) for pl in pool_list: js.contigs[pl[0]] = [] for ctg in pl[1]: diff --git a/gemBS/production.py b/gemBS/production.py index 925ca0ca..79aa6ff3 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -1114,7 +1114,7 @@ def run(self,args): mrg_bcf = {} 
for smp in sampleBam: ind_bcf[smp] = [] - for fname, pool, smp, ftype, status in c.execute("SELECT * from calling"): + for fname, pool, smp, psize, ftype, status in c.execute("SELECT * from calling"): if self.ignore_db: status = 0 if smp in sampleBam: From 8ec66fc6b479fd958415e81c96f3e053da5f564c Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 12 Feb 2020 21:11:53 +0100 Subject: [PATCH 43/61] Removing pycache files, added in error --- gemBS/__pycache__/__init__.cpython-37.pyc | Bin 32627 -> 0 bytes gemBS/__pycache__/database.cpython-37.pyc | Bin 12820 -> 0 bytes gemBS/__pycache__/parser.cpython-37.pyc | Bin 5495 -> 0 bytes gemBS/__pycache__/utils.cpython-37.pyc | Bin 11317 -> 0 bytes 4 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 gemBS/__pycache__/__init__.cpython-37.pyc delete mode 100644 gemBS/__pycache__/database.cpython-37.pyc delete mode 100644 gemBS/__pycache__/parser.cpython-37.pyc delete mode 100644 gemBS/__pycache__/utils.cpython-37.pyc diff --git a/gemBS/__pycache__/__init__.cpython-37.pyc b/gemBS/__pycache__/__init__.cpython-37.pyc deleted file mode 100644 index 851bcff1b299a18a742c43ed6c5bf49eb58850ae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 32627 zcmb__3v683dEU&O_k+VBMN!mqM{iLgsmE&VU9Ge$k(9JrOOz~Wt@T~4XNPl!qoIZ~ z)VY@uHG9W#S5D&?ekj{EeYq_a7;X|6hU2DbV7P6X1dR){MG>Tldy%9`nzq>>MT5Xd zgGG||`~Gw1L9W&<+ESV`=lwqa`<$5`7)XckxA4iobNk|NghKy`fzID{c+TSEd_5cr zsgNB~VP)0Bc38eG+mi2y9g**-9hL8x9h2|49hdKfos{pCos#dgotE#6ossXXot5ta zdqBQ(c22%KU_0d%H?kwpO-Pwl73RL-vk~A(c_t&xTaCvhz{I-c{M922@TBs=OLf!)lWn zQJdAM+M>qP)~{qAh3#ihVw>8I65AhHb^)OsY9~TF5!$WJsAp8+vtfIWdP(h8d*It! 
z4XM2gmfENGBeYMwtPZGy@aKJM}hT0AxbX=W4=mbLJ z2tBJ#B6PBnTs(}v9#PZkBkH3lH=$}a_ymer`aex z(52<&iY}CNbETmQ?wv}ZT3MRDQCO}mSL(G!1yLopu&kT+YpUX;OY^mQ&3#yKn*|hG zXzC>?qiPEa66)`rU%pdwD}`%8#qV(&&Ao@M6B>{B zDZSDty3JIFO&fR=PidP!UuP3YewBa>)q$|rM z_YMol5PFG)nu7tkE4oopQ&M=YvWiw7bh^sAIs;`L(wU%=n&X;5GgGgDUV$If4#w1I zx_4@gD*Ld&tr+yQUcwIQE7~Y6Rr>oWQ}Cc7y)8M;3RB%exzs4kS4>6Rg54=CbN%^8 zsFuWM1|R3ou)uC;jc_aU#A;ifr9yX8`m7sTwALb?<%JO&kxqKm?gIL0xO?X_=(bD0gXV+XtIxJadeX`Bu~MzA6ULyEfPqr=yJvJ2RmLsQprEmjaaDFD?D6&1mAwd zdFc4jlCIwOW6m-lzMm+~JJMjh<}j}xFO^Xxa=l)>`Y}J%gVu=vZutr3cPh(%lwi}( z7K?y^E)KL)Ec%%uAvzGkQn5Im_M=XvzTih0_LGF00KM9eN=soRFv^dcMX_TG(v#Q% zJFJH>tv?mg$ME(3_|ey#igu3Psi5hj<)*3}l{E_7VzJh!xy9nt@R!a3CB~dJ@X1Q3DzR7_4oa5-F&cG?)<3!;kQ_--*_yf99 zQgf(&rd}yES{aOaver=Ojc%nGSQ0GnC+3}Exm2(FnFqD%&E|BidaG9TGxHTCf$56! zQ)>Rk+_hp&t@|u_G3USwbk*g{s_ACp-t#9Z2b6Y-IskM@Z=j){fyM$Nk8KdacZ3#y@@&+zW{3 zBCC)WDjFA{9)5s%xDj!q;>N^{i<=NPDQ*hDK2%H#z=+KGp_}6wKeAA-;aJq&B0U+* z&@$GPOX=r=z$Q8=7dWvr!bFmNr{t7X8EO3vK8p2p?-z;Fo2ggrm)6 zz~T(Y0Rr4WTh`!5>&=oHkD7gP4dd`*ivnYltov1D**MQ!h0oyQjKB%?Z4yGg9e`@B zq0V&m_KK`8vJc|ZC6;oCHPiO+3z(oV2nrHR7QQ2^_f-UlyU~tv0vE$x^mNFJde+^z zzP)+?gj5s=-g**3o>7npv3A^xn|xbCUhH(}-t?`|>Xa7=^6Uxn#7xP%3B9l(|H7@% zy61kh!;6?dQ7BFnc67&-7wiBxtyhKp*Db$>7r>g8_)JvWYp?4|tYF9gL zV$jc+mlonKiI}*TM(@6d)_&m@Mz3P1FY`FO8uBtI^=Iv@n|Ksz4|rK`pqO0!eJ|r> zyCeNI8AZ;EsrX%zAQ!wGa-|yIKukhnUYD43<12_sO3b3fWE!7GOiE%riODur5tEjf zFG&kLdyM9d2l)i!N`r~xYv*2I=B8ve4)jY78FkH{ld4&!E zr@lnzRXQJo0}3g*=$O-Fx)dWt;rPjqRUV$#x~Zil&@Nzl8%p0~!LxLx>9B2mk(2I-ZZ0eT5oVn7CDUa4$6-Jd`cU%>80fac+@q2_NsYcX`wo_h8K=8X>OuyxZ*DM#z0Cr`R zyApX$cuCy=ct#vT`@FdRw3hF&$h?mgsx(P4kCLdYOyI!h_STGYoo1TTfq_BUj zx%b0I$5XOnG}l!VH|Q#zJ8=96SZr)rtTG6mbPWuGa-gIsKhd?g$;&8t~5TG z)dCh9TOk0^d4SC30N#w3R#-GreG1@IC91J@Zgtwr0SKRJ4`Pd@y+JJM zpqIjis;}4GTtGOi^-BQhb<`ZuKdX}MD7Fj;!Mm3Ju9sPhsFX^B^T$SH zyIVslg|?*++^~-=JsiYmuwnCVbP>F=mseSo8Bzo7VTnn2L*RUo*G+muDhC?k`|V9` zYAx+;!X_OtHMBRc`f3oP_|^6xdY`^a&iNIN$IZxiM$leZudQD8M*2o{Iv7!w<3^uY z>MbClVaffaPLI3+^oTRr9XEjYTNw4AvhGGu<^;%F)4~b1z_A9a@U#|wvD?BQ|G*Z0 
zQINz5qUR2Ur1ECG=&7Zr+^iacya064qwqW7V7AOm=P9pvY&o`@Q^Uy97t6eAQ&10A z;VUYbMXB{Goj$3}81<+(8k8NuJkk#dh@xL=#2Sxc6}EU=P|q*BgWeW5|HwkS-}SZt zSZrxTt075^=qkn-Wv3FtMu}FK`#+v?;cwu72F}UWornvZ1c9Q4SQRJ z8ic&KYPGk!o4jq__Nt|}xFd_3)wVB2+B>`*i=%4$Rv_8-&ed6O=i=Df)+eF0ZBIh& zUEZ$85$F1~khcqR<msU1%&wF^6Nmp8Vy)7!;%9!KQ7q)GT-jL%dl z*Luc#2DN;xJMwNUdN(Gd&NE0u3H1!;$BU~1z9WyLj@ik(yeQ@>5zKEw?M9u+U>=f~ zhlum%ZglM#5B_k-Ev)SZcuzjD!l6bIE4_y^0}=xiHK4#AFV-%A}&6#^jtkhiMe{7(sIhl)tPFhy;o9RfIr&aha9KW>1w3CAE!9P zQ~Wsk<{;9EVb$9Q9%j7*Yx}rXAVznA7~K!~SWJ%^aunl#iQ_i}DWKg!?50Dj z%kBYh*P}3Yl)_0KZy)v!uN}miAA&yMEau^`w+Hga!)xP$?v8jzWVT}Z9q)*D5NAB5 zTd4UQ&c{LII|4%O@Does`LK7;JA&0agdKr+?1OZ36Im5}zr2j^c9+pW`5Q^XTm%_V#i3O-uTF#2a@Ha|9q8WBO0HKfQ76 zIEH@GA%( zYrKo_q=a8b_;>@OSH~s%dH0Aru{gOl<()8d^eF62$hhHd4Ly!KzM5|DZd^l&aVha# zl&ROa}oH|fQZGpR?w7)_$alVEZt`j{MI(5!b-YWs$F zl5B}LfK^H8pTlS`V6>CLD3ibsNC$bJxc6y{4ZW0nzIRe#F4vBGQ{K^z^p_by1illsTK=S*Ee>?N!>VqL5*@B)0QGj|h&Dwu~!j5T)e8#u4uF}qm3 ziZUtvsnxIZ6adx&s&bxD?|#fVU~e~kP{Nmjz5WrTeJt>Q6#mPB|9N!qtAq?*Lc=@*wuru z3agUVj|KXM4D@{*Za{Z{loV6c$J?g>!+@v0{A=wOg${WUyK(}(xsLsFdiC4hX~3gz zwa-Z3&!`)isWa-PxUT^#V;l&70$Adf%+|LtS1-xP`tk-Nd`U)lM(s!Gmt+plsDt9? 
z{GCw;#D4$~{&l1u!}llgeN@tHq@BXPP3!Aw8nMT*OR{R8@$po9FZHcsigWUzGZ8j3 z5#Xuf z>rH#J3zm1zo54=Z+#AKIdwJ~)%epTuzPxz0eIA(a!s>NZ#EQJyzUZ9;l)o5YgIBy) z9%uAr?-lM!wFB$U(0TP1_Si+lV($_cpg$K?$)sY|E})&uXb0c=Cy{$za`a{(=@nF2 z9fin6mvhh$pKGjnSYvP5X0jmUmV&D$xF5l|r@!tCiAQxt-Y z=j?>Lw9II5SEVKKWSz>rl?pgBhNK73ELTJ|U9Q-}U2l>41QFHdMIr$mNs!60GmT~g zy^%)ktP#l-omva}lZ={o8p|TU>jq4|1jOQO)6WDF4(U$XNI9f4S1}!dVB|xmOSd^^ z7w4ML_kztAt?mTH06Y?*lRa0Og=A4E<4*?f||ulog-|utTB|^wFXK#t<4mN)T$TQQD}VG9VoO# zY;wVfVMNYTVh=<#Ly3vlBICq%;W%Y&Zt?)D}6caQO+UX4vn) z%CSi2ll3NaDl}|>z*kHi{1|f`x1-2-T>l7bNOdwDwoSU2Vb3uGJ925(}t+De<%P<@<3m=$4$hP#u zMSTukZ|!=8Q>6=HcEZg=RU!7V==k4m-)wECw7kG+Wh^%h%pIE52F|9kt+{jWhrQPB zOAQotu(gfZ1db6c3Uq;c?b0j?`r+f6@~Zc^ZQqO0iO`|ks+D}ZU6lW=gz+5{Szv4X zOmn5Kq|Id-n*`&K0gT65saqE(r*BN&sNAz-(>DNwTFK_BiD+$_S#jLv(oAU?rWKVS zGIU@&p9KCV=^7)yYg~Npl z*RNgyNI~@w&_Yp}>H-Zrc8Sq;c1F|^1_s)hxn=_gS&~GJ*V>H5;87?LzMLvtsX5f3 z>ovABkv>(p*@R6|qXNs9N@2+)N`6?ItTY01Bk6!2egGq$IQhw zUSyGgF3=d3D}BUzebtZj`D;B6vG(U3v}q@CE*DCW2is9%Jw1UGAWWP;{SGsIolZj{ zxkc=x;KpWM*=dgQs^C%m511hVvjuDb2NsZ+4-p7Ec6PZ0t1i_E@n}KKOd#m!e~eOg z)|4uW*_53uH|s^B13z5zW7H}`(aaq7V4!jA*oYa#O0?t)x)RJs+#@)KxV3^#S_@dO zWnH;%r+CJL3G9F>J4z@egq9tJp5nMY+SOYZ``{3u6bCCynY%AGYUk?}TDIXhRg6#n zD7#{*c|TmXlQqmL+H>?##>N&nU>q8^WVNm4LZAHl zI7^S9|A5*Rz%dXV0q*(tNOATN2SA37e^we2b;{7A(1#dn5NboHKcRFz4qq1k#74x5 zM)+;CCB~)22(3Vdk-OvHW91M-YmyyOE{kxSb)&^N>SX>AxU69W@fO0=-Ry#P1{Nlv zJW{gI(y$C_3L(D*$vf;$pue%ANI@MDYH*l`J*SVcQPh&Pwn-n^FN9DdY+LYeSbS`C z6nXP#*~BxA|JZxxXAaKD#?M#~C;BB=z;G07ow^716n*ZMvErhkV6Tnli8>4RXaq6T zMDaU^JgkKuTG))?gI-w{YUE6C-p$N&4a`{b)-Y!QWw>%?4XELwF2t-9*O1!nw9L~E z)*mYDJS6Yj}}Ly8)oov6t<8TUW0kS zldv8IqYb+R3U?ZoM`3ALObC$%ff+Q;5{Ab4N!U3HeO8PgColPoVSw0~yxwpf3X$>0hGv&~^*^ zJD^~E2;uHPdk|W?P)as33+ts&R%q zC^(#(-q0G%t7T4T%6Q+4uEE$EhKnQUVHBw;wb_eS#k9hUA;;$>Zq$nfaWLeCpd+lm zCh-ujdyz*Nk+hlihA{FxR5W88dEeNeo`J}H5Hk_p5`w!`+-+Mz3sJ1Xq5f4s-MDLjzIcN{2iD3Cnm@S9d zzKO%`!Jc|Ti8R=ba7t;#;_P=p#QA6rjS%W zRhVmn0%Rjy%Vdlq$Px4uGzlUKQ(*F$MgwOtC2 
zv9W?^4Ca^3`U>Wc#7Jk3VZ8}!0v45GGR@M7R+Q5>_uikAJ*S!7|A{OBc)nI`fp&pe zUn>psOh{p-AYvJ}ghLjht2NMzpQ^TuP3sY%vI`IHU~FWJK>I+r67-APu(!uR&!Ao- zT&d_H`iTi@;ui z#inYFg=Q-;<&-#x*0KIh8uGL==hMC03FutsSdDMdzkufY7!D!mh2i)aCj2BFVJZG0 zeLqD6MdKF@> zB@juVrD&*3>RRYl&~#WT{k!ZzLY4-r)iVwtaV50h005Jt5DE=!6MRjo!B7Ik(!{Ig zA8k%ZZy4GMv;k~wv!GyL#Q--A14d8)bWv(JLP}v5sDV5vCSqgI4xksLMUfbEt@)E^ zxr>m$jerX@3e7u9RV7Jp_+;Ut~Zi0&> z%>@Afo%IB)oIXGM>=Bb)u7w~cEPM;gEAx;N6abuHa|l{r6YdhyoD6P`fRR{PYIH#` zw`T|PnT-@c^}Uaw52GEvT3uFX9_0_#L^BJYQb^`+67Gm(I3TmSpz|f(-&L4?UB&BiE~8)7k5#3ezc7^Ke_xEu0(PJ`+k|A#fCL>-18X2>L3j%xdG8O_)yXjq8Y0cj zs~mD-ygb*UCmEbG$XNe2CNRKCn#zFIo|^#bXAZsbsZ(zr8Gqx{n{OO>6YlLdZmS~` zhv1Cg9v9KN{#(ecpTOx_Z(~6P;1RRPRW@3(zL{@_y)Y&KMpO&dk6HI{v8cB#AiB4v zj_L2A1cYpv@*Ui4F(N&ahrs;(^l9!(0*{ye1~(1ciKC|!Tg|Y<4eCLrPE$BtcV2?0 zll>+Ta2>gQh@?RDIF(lX?RM(zcDnWQCE#I!6_nZ@YETqc2LaTP3F6Wm13kREK}9h> zWbp8NsJ-u8bp?OE1;cqt(wBKq;!Xv^{yVs$gBy*@+GUsG5J|ZerN7ER?16?&s1}sX z!zd^k*B6Lp4d(kV@KNcs&=gE-Fl0nms`}rug76Z=b$7HKZ&V%>otAIm6QzDg<`Dwc z|IV5?oqlfpynTjKN_-oNY{LmO7f#6f@}FUWKJlnk@K6($-A>bDozGcho!y2AkHvz3H|R7n^Jk^fVt=INgmv`4H!H2a1%^84J${s zI_>4u2yiJZ#lT(~rjv1-yK#A`W7HcorPLOrSZmoQfKQ9CJ6p_|(32?GPit{djR{lt zDcI?4WzI#gZ66Iu8dye2Yawl$q@kZd+E9@8<4D^sX|VX)Q0`ku+aYPgK^n)i6AY;t zeHvB_WGH_R7FebwGk#cP1vxU3!_rYNBRS|}9R)8F%t=V?7CL7*SeZR)udI9;kPGZ2 zbmuA9|HRdSYO1{n)*i#&ChU|EFAH{b&Km)v9rIv)g`EMNxH1mLsLVi;Y;g0j|guIs%vK3PM@)5(_kzVha8gO2*)ALGwam zOimW&NNa#*1$}bp!sScjqQD^yV8{!ySRE1slGl5NxRDwJaTZX;wLUhf*9()})u@t> zLdWnWg(1KcScT$fwa^sBjG>u2LTdED=n^20FC1ENDhn%h=^aV&g1{VSJm?81A>wX_ z-MMUNvwr%>G%4CFWatr2qTrH5)>(w^rU=IT$%(w2uhL2gOiJ#uQ86)V<)bo-8Vxlbgkmu64B-M;PKxFTQEGxBxqHGIvr61Uyi8}QL{ zc{=dS%mtonm+7%4wNbflo1B!1@6GXKXO3H2CnqVscuJC7^#jgBuTa%Ze9q$2d_O$d z8tM%;*km2)0h!3wM?Rd`?(I&gLiE*gZxOqxW%2f_VYqvg^*1^5LI(T{ebjy#TD(f1 z5ahyn3#z}uxLG>q>0F?5kKT7bTaQk$b1US1&pP5Jsmeage?c@0Y*R`5Fwi(Oa(69sVJC>6o?*NCO}>xbDRsf z5FRDL6DB!?D|`B1LpmOV99)PRl>k$ME9q6qv5G6y7P?tGDi^51S(aV|R4gE=CG|Lb~NdPP^wDK4(wA@JP0QKR15N8Gibn*xX!Ehiv%?2lp^aIRb>{F&jp=E*^^;;-G 
zIPAyK89ajls}OSjn5lK}Hv#LO!j;+QI+34cv0eRbNUfY;=H#B3YONRVmg?mdTvBEh z(-b#EH}iO|k44)6pnr;^`bjz)U^7Jv8v*Vx!7(T(N7TXsFa?%!kp+g@fPZ|e1hNt= zG6!O3InnGhGub&aLEUf`-R-5OkvEu60h1R{Oaz?*CVkwm;$B_|deI1BDs{2c(UL7pLc6I4 z&~lkobpCSioW;i>O6uN(&II?K<+@V$37f2+oOWiwXoHAAJt6(K;ar##w66(@%R_MZ z$8YpD&}PEJJ9|a2NW2v7;kt7X_jch%JiycaWreJK2 zd-7l|LdA@igPWx$tT^E1;ZlSWDh`TRBs@2s7js8kn_I4w@nnvl*0t&#+=J6KTz{`u z76NbXZl$tZTxr072R5V1AC$0KF-A@(Hv|v{8n@UNmG6^&83aj;OMZO$UXkDidaJtf zv-AC-#GTS|y;)N}4NriGNruw4X8}U-5}GmatPedyiqf-;f|%kE+K@5nK}@;M(;1>O zOlK3F5js2Q;7uw+2k?$aNaIa4ipRb`xyZ!}RDs^n0%a^RTy!`9@iybmJJBcLMk9MlG0^~3Xhs21wF*RYlK|DYoz zf0aHtYl3vz3}F}GQ*INme^)V7CUummgyNhNf04)4!zfCMx{ z6sxA=ga$9610fRb;lm)%<5t1i1vT1w;~hh+kHYi?KIAo|JJ1e@^+LFn@4||pV&HOg zH^3Yh`f&%Y5zpZ|@zdY>IKY|gK`rqKuaQH-c6OFvCmF^y2mDJ9YsQz^|IQz=(OG;% zR@%jL07L7ad{J?P@rD9k{Q!fmc#}KrZ@L2I{X-Hc%?2lu)|2IAP&4i%L!2GJJTiBJ z51pYo`QtY~Gx*43GM;(OMq2-nw`}-? z4z@?JxXkWwazh{9oMh}OU`RA^S4nd|8kSq_(D-;^5KUlTar1|d703-7+z5nhcl8Tc zV`#Z>d-5}FyoE^?bup%2QdR@nvUVIyEADTvz7Cl*bj@CL0azBh+Yw)bdJ`z`pX~AV6#vrdHhGQHaDslFJNCOeKnB@~J_#TjN$nO`T zc+rL}f*ECt;SiW)w7^~L=7=q6qK5q|HwhKYx!O4}+CykRjeEmLL%TWPD}c6Zc!mWn zCk5AzDXSfCP;0lz7`feh9eu?^CHnPl?@oKkcI@8k%qhlf#KS)U9XfR8;xqk=6Av{WW%d|fd)^g<_kAlGlA7Sc?#O$OyKp!X1&J83Spi)0g)`t zQ;kVIK{lDBTd$nIem>xs!34i7C)rP4yL$C)318X3yO`k2ecV{u}3ScIr7( zcs3~BX;3QQMuj4g2DPnce^BXTA1qB1eLHdG`bB9wI#q_^N*Gu_&WjqCZjO)YUq=1@ zU=8+00!52;DD>+3Um{jhZllMs-OXYD6$W|Y4ZQd&gNGRucpyOH2!nYhP}T!8YXBdb zeN$Lfh3XpQ^pZi>f`-4#oc{z)hn)TmMo?*~-=;%EYbTdpcyWL)s@Q2=N{-H0z|b0(cDlw^HEw+vV59v#p2|bx@<)8dX%|RC zHdi1G_No^ldnS-33P=-uL|Qkx2FT)B=EQ)?=8jjZ$2?-FLlCXS7NK+JDbd&67zACS zt&>ug7o4+u&*OBk9?{j|Yy}3UXe-TgB4;8x_k%a?@egjjmw#}H6Hh6jPh$4~`EW<- z=eU!4yHNICcVihj@Gl`BjOQz`fHxb9+XW+#^|h4~91gdcK1E-zeensVJcxuh*gi20 zw%%TV{tke}o5T@o{#*`EYUxFH)@Io}?#_;CZg)qXx=mu$>tKt9v zViEwfX6fe<0@$D&9T$-Cpe>Ew;ET1R?YJ9X4RvfYXVM z^~I8%2YG>~Qy}+Q#Dm=w4uHH(!(@ge1tsoLl`=AS$pI08{$r+hLII1OQE5Eb04w-z z8p1(ZE?}H(l*z3Pf_*pdo!Q)ri+Le=UX?c43rN=c*mw4Xu_RH9FklF+EW5)n>4DJ@ 
z4RvJZHerTFpnc$e&8QrR3=7|b_@=@=kMBq|4BrsR5A5t@HGnB%m-}kNuoia zj;rd|y7M*CNyjX5)O?N>+6}N0tY_c}Gyv`vnk|5&8ikx64*;!=VT>e#&?k3mAkpvc z2h?G$Z36-T0ocFl_xNC8{|e`!GvN9}^vS2J&luMSWRe+whQb@P_l55KBix-aOaH`E zW-Ow4xxF`IL&5w6GZ*YB2x;&XlG#1OFs#6ydP2R*rf%G}Zrn&WZhJRwGh~M&-X?Ez zl`Fv+-+?sN+CSe}ce@884ln-*WFJNyoE`XfVEprD&X6O|Icq>N8O))CCBK}id(nX9 z+8MB1&k_y@#WfT>o@(xgigGqBcSMB}!$m$2a1cw(x8jqNBtKf|$;n1@Qdlcv!nF%^ ztrv*AA1>%G;M5zXQvHlJ^D|7lfj-=_8dq2fN>(25qNutK>@k~jo)Li2a*M9i--Ym zvR`XJEQ+Ut`64{;-u)6%LCX^J_b5lbo9%I$4K*N?-yX)1u(EM6+t8IFr?DfzB6bwF zy6@^Qv6_7>H(=y5GX}vJohq-W>5n4Sk2YZx0kLv<;f8R^SJZPboK`|>>fdFy9-bFE zBLZ~xFEA2^Erf)6v_K*99+WWCj#g{y)6o}t{9&S>E1b&S2z!*wi%9quhZ)5v z(b1e{jOZlF+c|Rlkqje`FoTctH6*XT0+KT%0u(+&w!OV$^@O(rgzVw=&cM8FrxBpE zcLgSHc>Ky46B9Ri2J7A!geYlfb>n!T1J7U4AiG`g3V5pBV1vS13NgFA-MZ!NHsWg# z%G9HQ~l?Y*lfy}jteM0=mNkEUdJ5^BKP53zL=2B=?gbKZX5 z;(|Hc9{5lWqMWaYm^@(~iNzyh6gWXh4#qi!XTZpsXUMVwI>96hrdP=?=GzB=e-pw6 zXH^tB=sfTcu|^hJmCfGPM~U`97k8!5{vqg#qj!lrKf;_4BaE+}#RFrwlZAesY#)X> z7oUT|<4w@ZB)l!&7#>0!_C_9M+IUim#$AAzVb$2XaE1Q+D_MTxtgl@Lw$N|BWRfD zjxN9iP|%A6!l=@F>N9qrFb(i4x4$U6HxJoM4$ck@MR1b{=3+7thy9h%-eMJpER%p2l6}1zk0oZC@}L5)s`FNWbc)*tzpJ543Qg^} zdN2Tx-WqlP2K`@OO@ht}T|GolqZwmIw%OGivfL#Rc=nZ`!JC;YJb`hvqt-UpCVJK{=ycWyyKygV>wi+3$49 z?LXxtM!J2YZAj;g_QjiC9s=x4Yq-FqQ~&vtXpkPr9$U@&hCDa7i(qxSy)sUimd7aHGY#PxZM z#5|AjP5OG~aUw$z@d+g-c%h~Ob$Sz*_lc5?>TD-{ly!*Nyy#^_+MrLP0Y7>fj5uDZ zDK)AULEn6Y0%yL@nuC;_9|h!e#wYp|;azQkK8lL^^y23j9Ht}6@vd4;A7dOzI3r2y zY2JFuIZ??8nJ9V(;n#cYy9gzz&Vw!jmt&5zK0XhiKSp0q_9H^Y9gGtxkQn$|3(5!w zL`-m&m1N8@Hu@6_3YEBt=p%N7oExU>qEGt-lrx0+HNP>jQxqDI4Gu#!NBJNvK8#F} z{%m+pa957E-=a|CAy0_bBIbp5(r3A_ehvM3kIPynvBW>NOwE=MPPZW|41_j7&tSZa zfgCj9&K@z4Gdghy1|T`#g!?d?6mIN%`EYw=b&rt!D#}*1s>Q!?@jGNH>ZdvJ!_2;wBF5Z$_W^RC!o! 
zLCsOTy^3q(&X`;kKOwotP;Ua{+E#qS+MpU0CYSn#3z81~de8#35w^rZ+aAPiMch~M z&T5LUeu8WQF~#>>J@)1ukJkU&7%#JqcX-t<%%EX+cYsXXuAcFBr~;V$on8*i_JLr= zc5}vXSw5`xpx1G{F0hyX`!KcD8uCaEbqb z_l&eff;R_&!a)%z9B>EKAu$o$iXKdRJNwe6k!Gy>`71!3YxtJ_mO703xu13i`)N0B zbXnTPj>>!R_l@{Sw)fsON*WQr)&*2-@0*y%5o#Ri~cPRz%Z?bmNF#{6Jfs5uFZN z>5b@2(8_)HlZ$db4+3`k9ZSU?^nD@bqRZG3a`OtyeZkA zhv35tpCI2O_r3$NexG*`ZR|jvsqVT*VIn!GUS(e%NADGG(R}vcs_Sl6A5)jrl`3xk zpv*kr!qN6P`ZI_ATvgZ5g9CV&;p1Q{jzXt6pssiNR86#xb$8|s#9AB+M#D334C{U~ zn5Dj+qUD>^J7Cs+-P_yOw_iu!UX%SAL)xEs`}@-V1Zkg;G`_Waa`lwE**hq`os!=6 zk1{5syv5wI#?ot7VRt^+`e9OfAY{M@fY$GW z_=d3wSOOt$kO6rsLfr)3jrpk~6egm}MF(t;p2;gBCKIt24AB z>QcCrmGN1~)}Xl%F)5tgLaogF_(>?-FR>$cD(HuFyCZ-a?UPIt&YPc<0Q0s^43;jZQU#z zR~cD{AS?`~4jq|t=WDIJ{B{!Mc}#%DLw^1U{ac1a zJFP6*qy!W1li*njnnPiTWrerk_%`5iShn>t!1}wj`3kPI7VzUB%iw<03rC)lfp&J# zXIXyT&F~dGD`=ib;11|3;!Nq+x$Qqm$EL#{xWd69!*IrA#4#&5XWnifWw7|%6ztxj zlbE{TxxuOV`POD6b$*eH8zQ*=lb3GJ*Q(QjtfBQx|E2_&B}U!_PGqnp_dYd4XSRb( zh5k&qr7DB!wSw*X^mck%ANF=1%4=*e#QRB1$Ww3TKqA;L!WDvl3*)F((_85Xck=V} z$vvFoFxEDJ)TWG_pwT?v832%C2Y|?;*oj~p+9@2@=7S=_f*&Xv<0$4lW$M_lJgD6i7o*Bh`eaR80Pt6Q379n>oGJ~CEnQkZ_%tbon|OxhA|bhSGT{ zXF`Q_A(O_pNBA}yZ8%_nhPO(T@_Q&e&?4rWn7TnsOQ zK~LBDITSIv#?vIw=4O!{}+kCOh z0)d%A$s|hRCy&2~*T@ipAvf+UMb!}MfyS>}&#$4LVIjdX-MBwM+@>z!gqnqL8fpDb zVMdfQd>jZu5o)$RoPy$y2;;Ue-W-uvYXm&^$&3s5Vok!@j`63k7%_pL(-yyiGCN`#ZV*r`ayPYR06p%W&_yuTg=ZpE3K44 zxsblW{P7W@J25}-{*M%5ogJy&RO`gLhM#)>M+C2UY9^5MO^pwHICAJ||9W^~10oPm zpB0swvG_`1RgFeX3t+vB0UF@Q*)&?XG<_Vo&d|xyImX!#he8tlD&l^hu;&0mp`^eb zK#&pKzez)r2K%fCApqYfgkL^VhCiG{ljrdmzUMB;Zz*NzMdStjm#msHK$AfzwmBU| zTjSd$ldPX8yYTZ%VoNA_qzvq;#h4luvF2gci9boxpWql+Gjs88JVNKFevVBF`u!97 zs6sPuto}58M4^7lNT{%sMtqkfts^k|XPM_)aO{M0kJfn4nzwU*hw*<%=R0uxT=xmt zDS1IQCeO*5+p&U`X_e;VS}Pv=(3WCe z#U7djpsKCSt``G;-8Eka}gbY^wHee+^yjvLkQ1^`$M7Z{{otg*4qF8 diff --git a/gemBS/__pycache__/database.cpython-37.pyc b/gemBS/__pycache__/database.cpython-37.pyc deleted file mode 100644 index 
3f73ed5945add9f2b39d7c91a7be0f12aa615f63..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12820 zcmb7KTWn*;c|JGZDN3TK>)yE9E7xo8+Hq3XUE6!p+Ux9Uw_4fj&=-c{98#1hQkrwL z=F;aRK-X>EAZhl&2>ejARMCetD2gH|iXabpNMC}!6iEAYkf#(Vx-WSO{1Bi>zyF^@ zij?id6?Nv!nae-_{PWL$o4GnRmelY!`|rKCy0)I=tutekoO+;*cBk{;AC53)ZD@30FuD1L{ zM_SN>iEHhxtx8KQb*lEZ+i16{RNem@AoC-X*cu^#!F3F*V^E_MX=tUW9qYySv{J%O z3R74gXpi(#3i*hLaz2fGOvE`qhI~RKIiEp3CDNQ97stey$b6ueCd9ay_&_U7ib;{h zJ1g>HO5{*8C8os;N^;`3m=$v7Ica6E$Yf64oBI ziwF(Gj4TJS>q|GTynbsnNZ0MHHK#1rYK^AdJwfRUD7t#@<*l}`U)-{{+VbJ)YNfhv zzv!%2q%FD;Zd8Z{i4CXSDie}HwAyaf8uh@e+inmOYvopD%MOx`Tam8wUc+4vVr9U> zI0Z8>t4&A7fV+$%`Fl-f@cK>R!t0JLoeS%B#a+Kp#po9pkudGkqPYDqh&Z;pyFG2obyI{db`k&~Y;h8lo&xYaYmsCrO zTJrK3%iW{bDy@90?dGLjZ#b?k^I~_q*{D`rJHKf^EJT7xx!JC62C+(YXSX5kU;?Bq zSKC|Veh*2=1TkqhZE#S*;t2*OdV~J$tx`;Gm79QlaL!2p2w@!s1c&#$k<^nQaYpY> z4d%b!Zor&GBYKzyd1QuYz;k_3m6(IMVg)RjMN3JmDRY!eBPkeaL1w6c*kfo_lEQ9= z7@kIrLkp?_!}u_E`)vkXKrM5;RS*En(Mx)f zJzcK5@m`8@y0)`^U)wo)A0zbAM3dI5of|_KJg+baA@)8O;*WPSh5ic+wJU~Y1zUe_pY?^Yx#h&$eeFfIu1=q@3uDM}tL_7^&tgD3e z9gEgkN>A&>c!z7GOt5~ZsckQ8yJw)4G1|&AXx+ngxv~0$7_wlL%w5?CVs2y0ZtuE< zgj#_K)bWA@F=wZVmH1MSyw+~D>?*B95HFV-t%h4J^U4KdRSC&ym!UG6wga-S+ttm2 z6jcOI_TmS zy6xWqbE~qwJpf6}%wR`h(P#}3j^|Xiwm&`i7ay=%X*LfI=(wW;ZnxXb|JQ)`f5!|# zC0VJuifG{s4-PmosUgx8OnC&0DoHJn!r$6zxRBEqsq8c*2RN#TvU_+O(y$uy6RDA; z-klln&R5HQtZq~A0vaE2N?}^V)nq7>FX}-Gwf6!CXY+9&Oi!HOL7|fuD>%4 z+L_qjyhDAf^Sm%2kuUm@&N;89xfCo)8f#^P+pqs5>=C)Vq+;S}(po9hZQ^*0Z5)-_uB`w78($(5>tKv$s9t$yE zzf!E-6lAi=lPD}js%3ohq6Q}9NnlF*Q-(sS+TMOxcH3mvy@g&5@dc!`5r?Esvb2t4 z(dn0j5Lv*xo7-Rb{mvui!$5wV12B*r@`D#M6v`-p$rQ>82~ zBmr3N(>nYFVcbUD>DPvI@3ZkU81x_;UqIP59%X9sY{0_%rX~+FU|lj}2CTb**%0e) z7$gJ9GADDvH$o)=_Hv4DRG2U%kY~eCv8kz8(=%#1D-rI?#s_&Q+MeM@z^F+PpWu9o^GOe7n}+;3m!-KZ<)ID=@aG=-(wlKi;cE;~(Idn^bx9K$ zKg|$K`KxdyNtiXi65}W*OePrOm^VhFJl5p{H{BZ(lh~>mZl9sq`eR45q_)Kbj;DOOX=aqVfYoky)9qg=Z% zR2xSLLkUMPnFNX9NBje0toao3ne;1B3*?RWj40tt%{_sh^WB5^w 
zH90a?Fc@o6jTJ(QGsakd#yHEodjtY>qGmAnl9)Z}#l39!O&FSwO2`&$pwO?7(q_jejv2G!@{b!bu8jjTf%0Wz}gzq+)tq)fyYU(BBw?)61@ zAY3`oQDYRp$Qb*53bRUE55gYu7+6*2DLGCFDLzSYgMCdc6Zj0Qjdr6I81V6-)wj06 zEJ3`_20;Xhyt*Dl>axAN9YmV#_iQ+2;53fE#O;<)(U-aMwzRhqzEP}KMiyLzz(Nqa zd1pz=wp8Z8lrrZ?gOSp&u?w&&iQnpaJy;;f(EdzZg6jWpn>S1yy$XjzhL_+3d8o!J|Q3;i&Z~D>K z1WMCzSaO<$cM{P7q_9EA{8Z^8=W|9Hfg?PClSFRLFz}9}Kedd8B{Tw+%pj$Amg6Cm z4qG&AZZNkEESf{WbYKm9j4evm07dCw5tv9#=yZgH5V1pCk|K7{>-}$wzoW2tA~0ZM z4d|fGCFoa}w`9N~G>{;@yb<%EyS?~kOm29Ix(+!D9li4^y#Y7wB}h*0yhO0E3i2 z=~><+wVl^Cl77~U_EG{P?-_U}yewuk<)$~rdKqs@B%$w9d$4HyoJe7e9712|p5^64 z8fDX<4aRWJd8A=4aj!`)$NKh?7(;#5n|!2;%%+9t!H?x=e#+TKS!4EN?Tb9p(S%b7=@9dK5WObz%gVP9zy`RbLGYj>_M zbz_(BUAcYt)>1e6zF$P&)j~w6G732z$bC)~sWPScv{LN$$}8ooS8hL{sLkiJzlo|+ z;x@wBH>qhUdGF@(E4P*?-<^4V8$Jh#|Gsn2@saM-u=ozEx`kn1tKVKZTx169{}&1~ z2gEsI-5d+Jj%YZ`Z6(G@00bFT1?4R%tQ|xVW`JZ?j&v%lWPy!v?RK+LwTG0$Eg}Z% z1#T z{Yx8J!@`$^a>#nl&p_hm@FVG;!Oy}o16iF5zoT3>gL-HNV*#b)rxHAby2xWCxsKm( zaAMej9LRKvcaWz|3fuV+0*T5?C8<7$a&d?N5}w1u!o==AM;2;B`wPvbQUr)aLT<2IcIKK^12&^J#Nn9bm~L0?9mUh{rY|FJ}llp)>gDP zH8-{qhu0U|NceGhGLXarUywYT1b-(*9P%%TRw+LPnEi5`WbGM`bog!klCe5@mt%wP z41Z^!H^%Wy;F;{Kcw@l$HAwrc7-#7i-T8!kJeNE=cyRDkraEWci4QgMEep+nW5K%NO^`H4{cD)1%O-k z4`=+@&Q)&~eO>_M+=tp`3{v-dkgnz(&7W@N2?D5WMWC6e%mn2izg4Npu)hHrI4dxx zx=E6o%eGMl|I2B1KH~L+*NQRz?Cpp@hcO{X{X^b@$g+p^u6IaSkoJar7m&x{gBtKw zSO@rrp(PFrjN=``8-AiU5AAdjd8{EIE&%Ic3=M^$lIyKF=7AE-BaZ%))EBh)<$bNL z`A0yHqj(>~TIA7AVen7x^u|nYRzP1u`oH;^)HHTJhBms?c>%gXQL@OCJWQC*Lo3K1 zd&hcnmx0X+U>V&bjkqx(7XnQ%2_QM+OeL!uf&z z#$`NAvrqaffH=sax9As$1yH@bf5t7!wQe_+CG&+C>EfYvcz~|-Rt@gmf#pM zgR+o&7$(!>vAI(M%o6xf&3hU1&iN+}V5ngK&^zI!H#Lm%EAI%fVhG+*@5CQy{?XRN z!I1&`hv1nbk1hFYv`R04+u&uN1f@<1(?89pzM6lgbIU#Co#ef6*+1Jk?49vWKhnKC zxWi+I7^=UATAcZV_;1# z`#F@Kf(Ji`BXpDzxA}{-+weZ&Ez&v<=CsIsd16*Wxd9HdcD@D3C%sdEbQr5cys$_R zR6j*&wE9oB2@9|>QXxIbLOJP!k2a2Y1*As-EwW=^CL|LN6CdF~Dpm&1BAtRIx#$&C zUs#ek^gic5r>v-uo>|M>KA$Zy!SM0#dBV`7g{)r z_b)JBi(Y@6bKV7o8>sd#_lb5d@g5*NU-K91pxVZi!U0m`Ipna8s5^`OF{l|?Sz5K& 
z#xa(n(VZ?}Mt9!DdT71V1Do%}a`)L$qwX_(<@}iuuUwM3#pW4XbAc||a#RMz+f?^u zB(NK+HMSuq!`^g`ffHvO8kFMCe)%H*1?e`vac-@GvIr?tMx9H)3n(`ZTAlrXdHn93 zJGaVLuiaSfz9iQ2>dY&@UU^{W*KE6$uQcJ?+9HoTA_(_zu%`+i*LJqsD0v#^>TA1= zrhx6#{lgWqcyY)s@@n`z2Z`wPz1H4E>qe_z!4B_!An6I20E5Uc*|$`{M;>G#2)ze- z@6q)Y9O=g|GXW5{*V&vy5Jm4CuziB~rDkPoO;lcT0L=jmt$TJqWX|SDNUP!{oc~vv z&P&LsE+dGL2~DABKEB8HhZ0QDS+Ks$LjngMwZNfIa+;dLhHgH{Zy*F_bZ_A_!vQijnDTA?{V9gI~;= z&w1I6DUCT>I9%m}qOvj$~0CyhZ46n@4Gu%K`nxk?#*LL*lz z!{d-UMXe5_j@%~-n$evBa>dvY!>kCK1+*h)4X&96XG@C$qmw8nS1pIyPq|$o^AFsv zIieY!Yj~W~Uo4cSLT6DJ%CXM*LFgQ)+ zf>L)rLZ~c?u!_Zwz+t#XV+gm6i3qu3B8sqCnBQ@^9|C7SjMw&f{Bk(KZ}rA+RMUeNFgki6;w8bKvl;1HHWFbEg~@R zs0Z9Ek(z~afzt!0N~GZ|?AMHiHRn;28L1f$Yi^)sVx(p=tob@>vLiK9VNDk`IjTV@ z53@&_Mw+Xc{*;#m9dXqWfjASPM4G`Ay*aS;eUb3TcfJqf19WkI0n;x%6q<_r)7~`7 ze@d+o*wS}?$}J7K=uY$|RXB@RV3u8*MI0M4UEd+26K){a&3Ut7U<jGXMi$eCZ3 zUjfSmsqmXRz*(55VjoTeBSGVg$x8vf_rRc6T*edEyl@ZfE zSW~90__7w%IhZqRPE5H{< zHmM7XSc*JJmZUApwJA{%r*IY&aguZl&!QHpVY^-mLvBQB=~C5SQ?g45*(;wJU3wW6 ze?y{|W;BQ{4aJuFqH;L0M8b|lB8kAPhO4Fr9|*oL98U@-G!SfLGg^39u>B=6WKC_x z<%A1wgYE>lxYUB{iH5U?5>wq07`a0L@5d>D=@B2oje=5mDhtn%HZFW+J=cyN2#$j~A1JDt1@Y zFi$_t{neGG_UFN3^Qr9Y)@qGT9$jcuxohL%*f4T-%h`pifWqMk<>@euoPqF=^!xyg z{!HoJ1|<1j!vV~Oi!KKsg?;lO3JnLI2;Gt#p4D>q>~cH*phCC4t~Q+A=Jq=7{^eg; zx*ZbU*|w{VTEiB{yQiphsF9+`)-LXJfnF8de3Rtk^bdA%_6rhpPY(6CNk`dgE%#lX z7xJf^LR@|u+!!PqPNN0O3TFbawp=-6J|0KCT&3g^8eX&aV6c m3WLb5+kh`Xf=8aGga-aBjNp$d6264>(j3B6UFcG2I`V(#o@u@S diff --git a/gemBS/__pycache__/parser.cpython-37.pyc b/gemBS/__pycache__/parser.cpython-37.pyc deleted file mode 100644 index b0e9fc97f94d24b7bb3ac69165661b34bef184ec..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5495 zcmaJ_&2JmW72gkXclj+*-?E)Bt`nPXA|*YffZ?=)WAr z-poASt5k{#zS#ch#y_qq%D<>_^5sxjLy8_EV+vDyik+3Jt;)7$YqHdBU6wgJ_hW@= zO#eh-x~J?YPjowvo*c`gC;vpV4W_=YG>xa=sI0drKlyZ2){vsNk$DQO0PbbUO$C3+ z!_F~-nV%?jo>{Db(qKhaLTO6A6;W+_{jKQTFxd9n552?n7Hxt)Dh;G}k)j*O5^Pv#MP6^4(@r_b!%3jOD7=K?+0E4tBacU` 
z9nXzBtF4fEtJurE_b2(lS99QFfreuFlu8UYX~p7vtJko%Emfq-ro5=b1V(5@lqL0oz!6qW((73~N+C zUL30cPy@8ZxU^Fml?mFhCIKB@IZ7V@!^X{{^k##FUepM}xZxf8QM@8d=54!sy|`Hs zS{RAk0AL714n`6#k=qIVKoo!O_V&CVaUODEY=^w>#-i*|)9HCGL(Rk{h2ahcUciJo zQ4J^HL@^5Yc+2aAVYjJqasXil-T`S=>LSa^=;sT_Do9ZknWC1}>uOz_RjUO0(sQ7n zac!wYjgLxFx3oI=7p!eZ8L5CwakN<9(MJGPf<@M*t~c|U-1;CK@OjWr1V+=z*f&)v z@bhvmgO+5Cq`gDV7oU+xcodaXJi+kA$a8CWSRs`2Et_FOLrrIPXF>w2y!|T3@$FMjp}e_)b%(w%G0W2jeieyBdcS?l)BHa52rS6Y;GKF-rD#$-TZEI zBi$TUQNKg=JDaywZZ}09V%&RwIPJx)?1*x*qc8%{8ux_R3i|^(D|PO*MGo0pqB7{V z9qwh89EsXV)yW)OOrLg;FOo(?HF_LPIMX5f4dzd0;&r5m{F~gAeivLoBK6|AxCC;V zBO&`Mz>DDoOnsuIT8ztp9AZaf+NXM|Cwh*~N&WjcOdWui=nhnd9BNB*KDo3(`3`QV zyNa)*d9XBhg1DGo`B+QuA0p{B5G&SB~|~Pf{x}nAO$zSMoBl5{nhlI;L5{ zeY8qw{ehLydK)ckNf{N!3jZRp(n4a5ien`$vI<5Q;}Waltkp-Au@YAk>&fI?-VLxq zxvL$keEEz6S_*iSCOpclHsLY#jXWxn$48ljnExg?TWnhPuCY4iFGGfzi}RDhSf|@T zN>P)v0<&gi?e9rB$#qTe`WvEsjX18AuN{xx7&KKw7ZPmQJ%YpmW^HvgGclj^9>uyR_%$joRqsi8iV zOdV_dUNSWqo0hf8Se2Tn2j!%kPA9djmDG|7Wwsl{_U^OF15R7+~a zIhjc+N$sw3tnFHll*3y|WwbC>u-~g6Y3VG+t3PiVK1?lS%ba8uCwoamQNQBe~~Oqka-AX7J&?O z0yJ}c?Ql7n0b+Id9qDN?<9?jmy$wV!j7e=k3XEY#$m^hZkmM(v?_``0S0umJ(&7oX zCD2C0_}b_tb^{m}lSO)msl;I@`Sc($-IlMe3zawZ=iZdV;>sJVN9m2V+dsS`Oh0J# z_Lw)+8-+^K;PjjkW*;vjKWO8ac6&Wp@eX6|wqjv$ZxHfWwbz zYlw-L@M^f*vB%|e@tL|goF$@e5FVUyIZB7GUOx3!CK>kExVd?=DU=U}=8dDHqcly2 z*S>KrdYL{`Uw@`H9^f)F@7IgpQh?gAiY&vUOI?t6IUGk@P_cuF{1 zZa=aY_b|+HDemK0FT(!z+F( z!m}K9<735I*Yg1K9u{h0mB^lv9e#hn!-2=0f$Q_=QlwkkVAF92z0hZ(dT!^w8_DQw zg1#tCHr+#OAa(4l|JV|_2~Jnq@0tnIfC{I*?? z4X7g$7Ab`!76#SW7XKOKLP%-?m>u{Wkd@vlWhPA`f_NQ;v>5&d)mA85MK)Z0b`JUxXu&@gUA!n@)o}G|LMp2@8(D9+N zfeT~Q!C1bTZDT)+f=xeidxMTE@<0+{S)HVrp==fzy$c6Cdtl$7wzW-e1b@f-l81W( z{wnq65#F^roMJFxdcnTW!vN<~?De61aOh!eyFf2~c>vkP!CutK>U@qS>3B^Fvlq78 z@~At=aA+=Nu@oIXLfy7Og?N2(HX-;V3=qc0ut>XrojF_Z|E|!nh;30jgjw^CG24d2 zX0gBQqR35dttZsJ(7WDaq4nRg3#~oQWxQmY(cYFr{p!9@yLK)ho_WeaCQmm93)ni? 
zo{Kyv=peo>yM96( zF)k4aadjrrA3^qQBApUmfQD{nDbh($zk^riNo-B_`fZ3H9;VaL#h+e|^ZB{;uU!Jk z8Rdm2^r8Th|2#4QCVJT>DK3rhISu{k+`98GtSdc#y;3@f&D`f(a*n)3e_ejRmr`taAN=2J#nlth>D71rJC S|AimWOX-9;+yOtWzWjf4OP6K< diff --git a/gemBS/__pycache__/utils.cpython-37.pyc b/gemBS/__pycache__/utils.cpython-37.pyc deleted file mode 100644 index 31d7e161b129d74df698bfbd5a7c716c1bbd5e88..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11317 zcmai4OOzW|TCOUUq>@_QZa?F^OeHgRB97Aza}EPzjK}dS;}A3^c5o2M(Uf(oyGvF} z-M32p(2?c|7Jh&GPv86fKYQ1*{)G?uuZqkKT=Cx_QC8bhwsQJ*+m^f2cH~}a zm*noYUAdRrWw}?{6}ea2RoqK`Z>!d>+1C4(a#i`MrOILbu+(0{b46A0Ts?H!4Lo_O zhNqg8F5|hbmhilU(iJ>4)H0ry@pMA1s1r}E_NrP{CviWiUQ=h(*{62EWKzc=s$rO;?Nxa=pJ;anfxeh|lgccgVVNcso<<}gms8|rk2TU#6< z9PCDVIM_n@nqU74G__hyJFVRN;msfpyL`@nHZnJG#j8j{3lz6VZBUvkD5a~nOrLl5 zyWuuxGrm}4F)uyb8~Be8e5^AW4*QsvsUEkQZt8V9gJ3J{bkcgKvo%zsKJ$%EXJ-`j zvyy73qlVp1=Sxd_n1Li;U%K*f9P0SWW*8)!SGq$LUJ*iODajeHZ6D|oTAbnJT_m3E z+VW;$eGW=)u!@glaebkZxjWwN~g7*z8O*|BMlLtlp*QH)Is zRo%yLFG@CLlgxH+1<@cU#G3zbwlPvU+S=}i+?s&o^f!apN2kSGKiM1REV5RY7Wrsz zH~qcMsJrP$pmWgghiW!*pf|PyjcH1%(d9x@e-Q5FQ}X-KAoPc_oAG6`q^UX@aDW%| zFWo91!;Cohcum@LNnZYVCoA2;CA*<)kkc9K_mq0!*jzW+b{qVhQIga=wiH7H7Gz6% z=N)Nvyc%op*AgCTI;k6n{q?kBTFtmi*D%M_%Zd61XDrL8jB{pR%M1VC_5JEcqk+sO zK&rh1%s`4sgMr{c2e@%feXXwWRC=8l&#UR=ZbRo5En|+4OV&u$%43)RY|!}d9EmiO4_(@l+_)rhkCpt zH&7y12(quHhQFtS?d?#5c|#$V4#~j*)!k!=f^d?EPzX6fJ<*EMV8b7-8~(`p8EQXB zd=Masw!&7IbLEeBlH(Sx_#BeNI3K;;yP`!4I=NP8V5-;%Yl2#>W-Xc}!&TL60{S#4GRF|Y3#ahzJ0utBW>p*Z`0V|; z?Oe-yZQ8A7`MJEt9_D%#hLf&oN3P&9I@y6v=9@0A;zzc>KNP>ZHGqlt>*<{ z-0E`X{P9jTAY616nF)BPWBA9Ol*mOB7inoyK?(qZv^=RItxU>D0X>j(89|~De1v@jyBCp#GXpMG5Uxn+i5Xz4R+oMEe0n${aAOI`P++`~{eJ<&3n>Q=wRgr-Hm%;YsDv>*CaCZZZ7 zDOAr5ypO3ttd)wr_%~k7gt*W_UE&JOsRJO|)+Ki<4 zD9@J*j+j*vJ3FT}kcH%ul_*?=xXyROjx%wd?@)&`CD9sRc#z3|u3~boWY-tt)Y*GH z{*}Ar<4np_n1o%(=4CK-IG{Gm3gSeE!Io(@zI41>vs6yk2%c#)J}snTFY)McHif&* za$1Joi3VxKs2{n*JPXUFwQheHhn;m5k6$mE=hHXiWD;;{j^)YdX+46D2X+~B!?c76 
ziwJFC__VCT$D@tZ<)YI1Y%-~P|K9yOMqtED`|PKDpY@DXQH;{E8S!=GVU39y^)E5; znGnCWm$MlH3}B0oiyR^0C%9s`-4G)RoNK$Ky4|o>>=ma{I)z-r_UsDY;m@f%uQ?ua z4ZGrum+l)jBl0X{kuq6eF0q!7MP)tl^a?FtLaDS8Cn>EvkiW|b<!2l z#9s#peWB`zfn!l;H|UQ-Q6qRz5&X9yV-$EGNH3)Q-ODij03OocykHB|EvmJ~FJ;8{ z_u%TXFUW|7kdM#5px}-1FLSaKT+TORjZC2#S)e8WNQvUj@#}dnZ(WSv%3kK4@kJ&4 zG`CnR8y?`* zbPyhwIf~j(am79o%gyA9(ghhi1z9^~pM!kWZQnj)k8@{gjWZ~ib3w>B!-Onm7h?6A zvX3ZA_A?33_4~du z$gtZFgTZJ!AAMdMh=U%iiwj}|Fhk!4?`LpB4Kcw5o!~TWF|f$ITW+M!es!spn-V^Ua0Ggp=UTer#gG#b&H++36e>9 zia`9dV${N>|GBvDseBV!u~;x9o1kd zZB>4WO0M2Ou>OnEv^uHwmZtEtj~cYF2{3eqpk3@Q!uelRryfT-TA6rajXhes;4dMC zA0ads(#ZbHX_ztTy-$%pGrs-Do``P=#3} zw@V0<%ub#be3?t-4DO-5frm4Uj7~Fz`@CMe!@)rC@Lb2Jp7%T#E}9K8>-)~^6o6AK z+(fzyq7XoU#d+tJ7Iu0s?Dywf?}rR3@E?H&1gS)<(Od#yh_2905N(rDMnYKG!el5x z)dqQ>>P#5KJb(2+_>J8)D2UWvLx8IikJi_teYl<)p$OcK5CIm>yi6ivW2WhEHczCb zK0x#L_*P7OZ6i!R6vdO4`{8cbPs_LO-2C``k&e_$AmS9?^i&HwtsdfWr~`dKZzO%a zv#8WNos2=`JVO_fmX>8*Vsg?7gPa4^tQn9{&6>P$%I#{BO~KwiCZi<4=822)aXcd8luKK#iA}G7hQmS8mKE0{4Y`K!79x^u!U8AH;({Xn z5%_tZydCuSf&<)S8y5mkxq8R~MA>68bW6DSYx|1+K8QRnUyRo-Dnv*`8Gj3}^bAgw z<82%T8jyLLMda8lW~+ajFQnC?&Tg+Pa^OFro*dX&H5WYLIV<)lhkU5Ryk&`%>KjKc;yk5GtLHIwt9d}Vo3Q6HIHWoL#xUK z2b9O>#CbLnpWy~7iSe?$2eiLhuU(9Va6+{f=p}ONMuYWX+Q>j-hmDL>3o;Zhxp01) zjxl{)!Eq6SlMg=lH&@oQ#sy#llDBNpOu^0N2Kthn}O?YJve78#yhjI^D?>yp6C6*&jg>w`L;Rh3U>g;>~ zvhth)sl45v;PCa4H8^FXC)=X;*|C1l8aSVbuZK~~@xSi3aCgz`ZP-WRR|@qJd`{#O z4?g5kQtQ=^miYY4##7^H8Fm&=#&b++u$fYZ*&fNpQh3Ybe|`i=CNfJ@RzMZ#{)O!n z;mM8}X&;R{CVrzF?eHX~63oi;aKq4t;!!r;*0|(P{6aLS zHE2&B;EK;6A>7XNgZ0e%Q)dUVFLHr{pTS1R32en0 zUpY4Kc{Q+htv|#WNPHccye%}qcQKIBqR)*BY0ma^K7?hDU;Tz5nq{~bt-FX7aGl4` zSH(enn*~iKKVh32MVI|Mi_T9tIA z1AN7hsReN}Q)Ovn*KNRSMU zbAmHI2B-1>pm%LuG3t4J{KhT#Uv!)ZO?@~yP%LnORTy^>f=Pxt&I1(`I$54EnZL!y zRU}xgoI@O2?TKQw2AE#pa#`P!rPdLLF~X#Ogm(|`7AJ$UV|XPy@?&I_J@v3y7jgXZ z#8b`}?zFc5DPx|ZI~PwQ*zH+c;5zq z0y(D3lNyZZ(xfq2#wlY}mA@!MH(!8mt{r(oyc*$EP{pQ--lU5|U1~xLvva$-m!hUe zJbm;7`4S`3yFO3k*-jh=joW;|5>Dj^)lbgIAx}n 
zyc_e5puP`}{vI5;0GuPU&Abv*fBy6=D-doI)Jl8_mc%J6j=UraBV}g!(w9`rMqW6H zn?t=~&E&H#e*^KJZE2Xr4vI+_fARnerzp+p@D zudKad&SD^RvUo89sU1mRzg^GXLh-bh@;o+RPIWTEkk*R0+1r#>F2q_zOdnv5F$>S} zkoTPPCC}ChWreDu)+^znQV^9(<+Ne11E>9rAyRvMRmgW6QKk*7UcSMa4aV=FaF*I* zm;k!~DUcZ4urH0EU^rOQX}csxT6uyf-2eTA$IJU`aqgUv%tn?*ce~gmg8(FqCf{C>~O; z7>dKYxK87ep~QBdCvg4TdQyqq&yhQ{V<@OOJ8q1*qd!Hf)ZRBARpAWFi^Fix!3T0*keb&)K_D_Uv zx!ixfh1|jtX})gZirYx`A07a+1KI%2*w%eu=N-p|Eqzkg_idcg*w)yGO~ta0;L-pi zUr$_|+rYK8fwfs*;csgH*!~3Qdk5%y$KgBd1EA4~n|I#7*YfXta_hs7AKd%hJE^Av zs0lc(8$bVZ{{H&=%}QF$J~z-OF(GmLZ~#cpBqoQYEy+hv-1V(Ibq~thQRMhj&}8<~j3S*dq=t;%z*v zUOeRKXS4xLt$)D69VDp}4UIXKsgft<`!I7#!l=5a41okjF;EDEp=BRr>^TAUSSlOn pT{9nlUgxWr6LBH52ECfJ;jFqVFRiYu^jEe3iRv)P7rlyC`(LDdG~56H From 0f4fee1da1feed89bc142c12b7f27883cc2e06ff Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Thu, 13 Feb 2020 13:06:22 +0100 Subject: [PATCH 44/61] Don't check for dbSNP input files if dbSNP index exists --- gemBS/production.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gemBS/production.py b/gemBS/production.py index 79aa6ff3..286b690e 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -215,15 +215,15 @@ def run(self, args): if ret: logging.gemBS.gt("Contig md5 file created: {}".format(contig_md5)) if dbsnp_index != None: - if args.list_dbSNP_files: - if dbsnp_ok: - logging.warning("dbSNP Index {} already exists, skipping indexing".format(dbsnp_index)) - else: + if dbsnp_ok: + logging.warning("dbSNP Index {} already exists, skipping indexing".format(dbsnp_index)) + else: + if args.list_dbSNP_files: ret = dbSNP_index(list_dbSNP_files=args.list_dbSNP_files,dbsnp_index=dbsnp_index) if ret: logging.gemBS.gt("dbSNP index done: {}".format(ret)) - else: - raise CommandException("No inputs files for dbSNP index must be specified using the -d option or the dbsnp_files configuration key.") + else: + raise CommandException("No input files for dbSNP index must be specified using the -d option or the dbsnp_files 
configuration key.") elif args.list_dbSNP_files: raise CommandException("The dbSNP Index file must be specified using the configuration parameter dbSNP_index.") From 610cd5232ff21a325bc81c21a1bf46c73e8e45d6 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Fri, 14 Feb 2020 08:44:03 +0100 Subject: [PATCH 45/61] Pull new bs_call version with selected SNP flags in dbSNP index --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index a99f470f..8b3d6728 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit a99f470f1ec7e65252247a8f39b64ede51dc53ae +Subproject commit 8b3d6728e92bdfae8839c57653b534368cb949ca From e88d6710c0cc9964ce55f410b54ea1259660e46c Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Fri, 14 Feb 2020 09:06:25 +0100 Subject: [PATCH 46/61] Copy from latest version in bs_call --- tools/utils/common/dbSNP.c | 33 ++++++++++++++++++--------------- tools/utils/common/dbSNP.h | 3 ++- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/tools/utils/common/dbSNP.c b/tools/utils/common/dbSNP.c index 6f70a208..99f09d8a 100644 --- a/tools/utils/common/dbSNP.c +++ b/tools/utils/common/dbSNP.c @@ -14,16 +14,13 @@ #include "uthash.h" #include "dbSNP.h" -static void store_dbsnp_entries(dbsnp_bin_t *bin, int n_entries, int name_buf_sz, uint16_t *entries, uint8_t *name_buf) { +static void store_dbsnp_entries(dbsnp_bin_t *bin, int n_entries, int name_buf_sz, uint16_t *entries, uint8_t *name_buf, const uint64_t mask[2]) { bin->entries = malloc(sizeof(uint16_t) * n_entries); bin->name_buf = malloc((size_t)name_buf_sz); bin->n_entries = n_entries; - uint64_t msk = (uint64_t)0; - for(int i = 0; i < n_entries; i++) { - bin->entries[i] = entries[i]; - msk |= ((uint64_t)1 << (entries[i] & 63)); - } - bin->mask = msk; + for(int i = 0; i < n_entries; i++) bin->entries[i] = entries[i]; + bin->mask = mask[0]; + bin->fq_mask = mask[1]; memcpy(bin->name_buf, name_buf, name_buf_sz); } @@ -217,6 
+214,7 @@ bool load_dbSNP_ctg(const dbsnp_header_t * const hdr, dbsnp_ctg_t * const ctg) { int n_entries = 0, name_buf_ptr = 0; bool end_of_bin = false; int prev_ix = -1; + uint64_t mask[2] = {0, 0}; while(ok && bp < bp_end) { if(!n_entries) { uint32_t bin_inc = 0; @@ -269,23 +267,28 @@ bool load_dbSNP_ctg(const dbsnp_header_t * const hdr, dbsnp_ctg_t * const ctg) { else { prev_ix = x & 63; int k = name_buf_ptr; - while((*bp) > 1 && sl++ < 256 && bp < bp_end) { + while((*bp) > 3 && sl++ < 256 && bp < bp_end) { name_buf[name_buf_ptr++] = db_tab[(int)(*bp++)]; } k = name_buf_ptr - k; - if(*bp > 1) ok = false; + if(*bp > 3) ok = false; else { - if(*bp++ == 1) end_of_bin = true; + const uint64_t msk = (uint64_t)1 << prev_ix; + mask[0] |= msk; + uint8_t tm = *bp++; + if(tm & 2) mask[1] |= msk; + if(tm & 1) end_of_bin = true; if(n_entries == 64) ok = false; else entries[n_entries++] = (k << 8) | (uint16_t)x; } } if(!ok) break; if(end_of_bin) { - store_dbsnp_entries(bins, n_entries, name_buf_ptr, entries, name_buf); + store_dbsnp_entries(bins, n_entries, name_buf_ptr, entries, name_buf, mask); n_bins++; n_snps += n_entries; n_entries = 0; + mask[0] = mask[1] = 0; name_buf_ptr = 0; prev_ix = -1; end_of_bin = false; @@ -300,9 +303,9 @@ bool load_dbSNP_ctg(const dbsnp_header_t * const hdr, dbsnp_ctg_t * const ctg) { return ok; } -bool dbSNP_lookup_name(const dbsnp_header_t *const hdr, const dbsnp_ctg_t * ctg, char * const rs, size_t * const rs_len, const uint32_t x) { +uint8_t dbSNP_lookup_name(const dbsnp_header_t *const hdr, const dbsnp_ctg_t * ctg, char * const rs, size_t * const rs_len, const uint32_t x) { static char dtab[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0, 0, 0, 0, 0, 0 }; - bool found = false; + uint8_t res = 0; rs[0] = 0; if(ctg != NULL) { int bn = x >> 6; @@ -311,6 +314,7 @@ bool dbSNP_lookup_name(const dbsnp_header_t *const hdr, const dbsnp_ctg_t * ctg, int ix = x & 63; uint64_t mk = (uint64_t)1 << ix; if(b->mask & mk) { + res = (b->fq_mask 
& mk) ? 3 : 1; uint64_t mk1 = b->mask & (mk - (uint64_t)1); int i = 0, j = 0; while(mk1) { @@ -339,9 +343,8 @@ bool dbSNP_lookup_name(const dbsnp_header_t *const hdr, const dbsnp_ctg_t * ctg, } *tp = 0; if(rs_len) *rs_len = tp - rs; - found = true; } } } - return found; + return res; } diff --git a/tools/utils/common/dbSNP.h b/tools/utils/common/dbSNP.h index caf706ec..3a0d9b5d 100644 --- a/tools/utils/common/dbSNP.h +++ b/tools/utils/common/dbSNP.h @@ -13,6 +13,7 @@ typedef struct { uint64_t mask; + uint64_t fq_mask; int n_entries; uint16_t *entries; uint8_t *name_buf; @@ -40,6 +41,6 @@ typedef struct { dbsnp_header_t * load_dbSNP_header(char * const filename); bool load_dbSNP_ctg(const dbsnp_header_t * const hdr, dbsnp_ctg_t * const ctg); void unload_dbSNP_ctg(dbsnp_ctg_t * const ctg); -bool dbSNP_lookup_name(const dbsnp_header_t *const hdr, const dbsnp_ctg_t * ctg, char * const rs, size_t * const rs_len, const uint32_t x); +uint8_t dbSNP_lookup_name(const dbsnp_header_t *const hdr, const dbsnp_ctg_t * ctg, char * const rs, size_t * const rs_len, const uint32_t x); #endif /* INCLUDE_DBSNP_H_ */ From a9893e05c95ff552b7a7b8bc876e6d3b071a4f59 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Fri, 14 Feb 2020 15:37:11 +0100 Subject: [PATCH 47/61] Bring in bugfix version of bs_call and bcftools --- tools/Makefile | 2 +- tools/bs_call | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index 3c81fadb..cfa96257 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -11,7 +11,7 @@ ROOT_PATH=$(CURDIR) # samtools and bcftools definitions SAMTOOLS_VERSION=1.10 -BCFTOOLS_VERSION=1.10 +BCFTOOLS_VERSION=1.10.2 SAMTOOLS_DIR=samtools BCFTOOLS_DIR=bcftools SAMTOOLS=$(SAMTOOLS_DIR)/samtools diff --git a/tools/bs_call b/tools/bs_call index 8b3d6728..34265749 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 8b3d6728e92bdfae8839c57653b534368cb949ca +Subproject commit 34265749602039d43b6f62f8e3a340ac8c65ec24 
From 5c596d627e0e13b8ddae42a8cfc17317025c4ee0 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Fri, 14 Feb 2020 16:26:07 +0100 Subject: [PATCH 48/61] Switch to using persistant calculation threads --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index 34265749..62002b7f 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 34265749602039d43b6f62f8e3a340ac8c65ec24 +Subproject commit 62002b7f986f55670c91ddf0d660b92c8097137c From 471915d06869b1dff7a585ec7c595784395f967d Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Fri, 14 Feb 2020 17:57:26 +0100 Subject: [PATCH 49/61] Adjust call threads multithreading --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index 62002b7f..728748b5 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 62002b7f986f55670c91ddf0d660b92c8097137c +Subproject commit 728748b59c368be269ebbbc5ea62b4a7db17a1c1 From 1b39b964348f07d11afd6c2f95357b0019e08ba5 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Mon, 24 Feb 2020 15:17:42 +0100 Subject: [PATCH 50/61] Pull in patches for htslib v1.10.2 from the devel branch on github that fix the --write-index bug --- tools/Makefile | 31 ++++-- ...0008-Fixes-threaded-bgzf-write-index.patch | 105 ++++++++++++++++++ ...k-offsets-to-be-at-the-end-of-a-bloc.patch | 50 +++++++++ tools/patches/README | 3 + 4 files changed, 179 insertions(+), 10 deletions(-) create mode 100644 tools/patches/0008-Fixes-threaded-bgzf-write-index.patch create mode 100644 tools/patches/0009-Permit-bgzf-block-offsets-to-be-at-the-end-of-a-bloc.patch create mode 100644 tools/patches/README diff --git a/tools/Makefile b/tools/Makefile index cfa96257..ebd8f3aa 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -12,14 +12,18 @@ ROOT_PATH=$(CURDIR) # samtools and bcftools definitions SAMTOOLS_VERSION=1.10 BCFTOOLS_VERSION=1.10.2 +HTSLIB_VERSION=1.10.2 
SAMTOOLS_DIR=samtools BCFTOOLS_DIR=bcftools +HTSLIB_DIR=htslib SAMTOOLS=$(SAMTOOLS_DIR)/samtools BCFTOOLS=$(BCFTOOLS_DIR)/bcftools -SAMTOOLS_TAR=samtools-$(SAMTOOLS_VERSION).tar.bz2 -BCFTOOLS_TAR=bcftools-$(BCFTOOLS_VERSION).tar.bz2 -SAMTOOLS_URL=https://github.com/samtools/samtools/releases/download/$(SAMTOOLS_VERSION)/$(SAMTOOLS_TAR) -BCFTOOLS_URL=https://github.com/samtools/bcftools/releases/download/$(BCFTOOLS_VERSION)/$(BCFTOOLS_TAR) +SAMTOOLS_TAR=samtools.tar.gz +BCFTOOLS_TAR=bcftools.tar.gz +HTSLIB_TAR=htslib.tar.gz +SAMTOOLS_URL=https://github.com/samtools/samtools/archive/$(SAMTOOLS_VERSION).tar.gz +BCFTOOLS_URL=https://github.com/samtools/bcftools/archive/$(BCFTOOLS_VERSION).tar.gz +HTSLIB_URL=https://github.com/samtools/htslib/archive/$(HTSLIB_VERSION).tar.gz MACHTYPE:=$(shell uname -m) ifneq (,$(findstring -,$(MACHTYPE))) @@ -34,10 +38,10 @@ all_static: setup gem3-static _samtools _bcftools _bs_call _utils all_debug: setup gem3-debug _samtools _bcftools _bs_call _utils -_samtools: $(SAMTOOLS_DIR) $(SAMTOOLS_DIR)/config.mk +_samtools: $(HTSLIB_DIR) $(SAMTOOLS_DIR) $(SAMTOOLS_DIR)/config.mk $(MAKE) --directory=$(SAMTOOLS_DIR) all all-htslib -_bcftools: $(BCFTOOLS_DIR) $(BCFTOOLS_DIR)/config.h +_bcftools: $(HTSLIB_DIR) $(BCFTOOLS_DIR) $(BCFTOOLS_DIR)/config.h $(MAKE) $(FOLDER_BIN)/bcftools _bs_call: bs_call/src/Makefile.mk @@ -47,13 +51,13 @@ _utils: utils/Makefile $(MAKE) --directory=utils utils/Makefile: utils/Makefile.in utils/configure _samtools - cd utils; ./configure --with-htslib=../${SAMTOOLS_DIR}/htslib-${SAMTOOLS_VERSION} + cd utils; ./configure --with-htslib=../${HTSLIB_DIR} setup: @mkdir -p $(FOLDER_BIN) bs_call/src/Makefile.mk: bs_call/src/Makefile.mk.in bs_call/gt/Makefile.mk.in bs_call/configure _samtools - cd bs_call; ./configure ${BS_CALL_CONFIG} --with-htslib=../../${SAMTOOLS_DIR}/htslib-${SAMTOOLS_VERSION} + cd bs_call; ./configure ${BS_CALL_CONFIG} --with-htslib=../../${HTSLIB_DIR} gem3: gem3-mapper/Makefile.mk $(MAKE) 
--directory=gem3-mapper @@ -77,13 +81,20 @@ $(BCFTOOLS_DIR)/config.h: cd $(BCFTOOLS_DIR); ./configure # --disable-lzma touch $(BCFTOOLS_DIR)/config.h +$(HTSLIB_DIR): + wget -O $(HTSLIB_TAR) $(HTSLIB_URL) && tar -zxf $(HTSLIB_TAR) && rm -f $(HTSLIB_TAR) + mv htslib-$(HTSLIB_VERSION) $(HTSLIB_DIR) + (cd $(HTSLIB_DIR) && cat ../patches/*.patch | patch && autoheader && autoconf && ./configure) + $(SAMTOOLS_DIR): - wget $(SAMTOOLS_URL) && tar -jxf $(SAMTOOLS_TAR) && rm -f $(SAMTOOLS_TAR) + wget -O $(SAMTOOLS_TAR) $(SAMTOOLS_URL) && tar -zxf $(SAMTOOLS_TAR) && rm -f $(SAMTOOLS_TAR) mv samtools-$(SAMTOOLS_VERSION) $(SAMTOOLS_DIR) + (cd $(SAMTOOLS_DIR) && autoreconf) $(BCFTOOLS_DIR): - wget $(BCFTOOLS_URL) && tar -jxf $(BCFTOOLS_TAR) && rm -f $(BCFTOOLS_TAR) + wget -O $(BCFTOOLS_TAR) $(BCFTOOLS_URL) && tar -zxf $(BCFTOOLS_TAR) && rm -f $(BCFTOOLS_TAR) mv bcftools-$(BCFTOOLS_VERSION) $(BCFTOOLS_DIR) + (cd $(BCFTOOLS_DIR) && autoheader && autoconf) clean: @rm -f *~ diff --git a/tools/patches/0008-Fixes-threaded-bgzf-write-index.patch b/tools/patches/0008-Fixes-threaded-bgzf-write-index.patch new file mode 100644 index 00000000..64cf0eda --- /dev/null +++ b/tools/patches/0008-Fixes-threaded-bgzf-write-index.patch @@ -0,0 +1,105 @@ +From e9863a0f149ea4d9e4336a061d7437952b6c7c8e Mon Sep 17 00:00:00 2001 +From: James Bonfield +Date: Wed, 19 Feb 2020 11:23:33 +0000 +Subject: [PATCH 08/10] Fixes threaded bgzf --write-index. + +This adds the analogue of the hts_idx_amend_last function for bgzf. +This is necessary when multi-threading output using --write-index. + +Fixes samtools/samtools#1197 + +In theory the change should have no impact as the only difference is +whether our virtual offset points to the end of a block or the start +of the next block. Either way the two offsets are essentially the +same locaiton on disk. However due to a bug elsewhere (see next +commit) this lead to unreported bgzf_read failures. 
+--- + bgzf.c | 34 ++++++++++++++++++++++++++++++++++ + hts_internal.h | 12 ++++++++++++ + sam.c | 2 ++ + 3 files changed, 48 insertions(+) + +diff --git a/bgzf.c b/bgzf.c +index 0a76676..f2e9b1e 100644 +--- a/bgzf.c ++++ b/bgzf.c +@@ -226,6 +226,40 @@ int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t e + return 0; + } + ++/* ++ * bgzf analogue to hts_idx_amend_last. ++ * ++ * This is needed when multi-threading and writing indices on the fly. ++ * At the point of writing a record we know the virtual offset for start ++ * and end, but that end virtual offset may be the end of the current ++ * block. In standard indexing our end virtual offset becomes the start ++ * of the next block. Thus to ensure bit for bit compatibility we ++ * detect this boundary case and fix it up here. ++ * ++ * In theory this has no behavioural change, but it also works around ++ * a bug elsewhere which causes bgzf_read to return 0 when our offset ++ * is the end of a block rather than the start of the next. ++ */ ++void bgzf_idx_amend_last(BGZF *fp, hts_idx_t *hidx, uint64_t offset) { ++ mtaux_t *mt = fp->mt; ++ if (!mt) { ++ hts_idx_amend_last(hidx, offset); ++ return; ++ } ++ ++ pthread_mutex_lock(&mt->idx_m); ++ hts_idx_cache_t *ic = &mt->idx_cache; ++ if (ic->nentries > 0) { ++ hts_idx_cache_entry *e = &ic->e[ic->nentries-1]; ++ if ((offset & 0xffff) == 0 && e->offset != 0) { ++ // bumped to next block number ++ e->offset = 0; ++ e->block_number++; ++ } ++ } ++ pthread_mutex_unlock(&mt->idx_m); ++} ++ + static int bgzf_idx_flush(BGZF *fp) { + mtaux_t *mt = fp->mt; + +diff --git a/hts_internal.h b/hts_internal.h +index dad04cb..2708123 100644 +--- a/hts_internal.h ++++ b/hts_internal.h +@@ -108,6 +108,18 @@ void close_plugin(void *plugin); + */ + int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped); + ++/* ++ * bgzf analogue to hts_idx_amend_last. 
++ * ++ * This is needed when multi-threading and writing indices on the fly. ++ * At the point of writing a record we know the virtual offset for start ++ * and end, but that end virtual offset may be the end of the current ++ * block. In standard indexing our end virtual offset becomes the start ++ * of the next block. Thus to ensure bit for bit compatibility we ++ * detect this boundary case and fix it up here. ++ */ ++void bgzf_idx_amend_last(BGZF *fp, hts_idx_t *hidx, uint64_t offset); ++ + #ifdef __cplusplus + } + #endif +diff --git a/sam.c b/sam.c +index ea66d25..0185b82 100644 +--- a/sam.c ++++ b/sam.c +@@ -740,6 +740,8 @@ static int bam_write_idx1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) { + return -1; + if (!bfp->mt) + hts_idx_amend_last(fp->idx, bgzf_tell(bfp)); ++ else ++ bgzf_idx_amend_last(bfp, fp->idx, bgzf_tell(bfp)); + + int ret = bam_write1(bfp, b); + if (ret < 0) +-- +2.25.1 + diff --git a/tools/patches/0009-Permit-bgzf-block-offsets-to-be-at-the-end-of-a-bloc.patch b/tools/patches/0009-Permit-bgzf-block-offsets-to-be-at-the-end-of-a-bloc.patch new file mode 100644 index 00000000..02cd211a --- /dev/null +++ b/tools/patches/0009-Permit-bgzf-block-offsets-to-be-at-the-end-of-a-bloc.patch @@ -0,0 +1,50 @@ +From 3868315dc8b6102a8de6cf44d7eae4b1a9aa40e7 Mon Sep 17 00:00:00 2001 +From: James Bonfield +Date: Wed, 19 Feb 2020 12:30:42 +0000 +Subject: [PATCH 09/10] Permit bgzf block offsets to be at the end of a block. + +This fixes a bug caused by indices generated with +"samtools view --write-index -@8" (fixed in previous commit). This +was leaving some block offsets at the end of the current block instead +of the start of the next block. + +However bgzf_read treated such scenarios as a truncated read of length +0, which the calling code then interpreted as EOF. + +Now at the exact end of a block is valid and triggers reading the next +block, but beyond the end of a block is treated as an erroneous block +offset. 
+--- + bgzf.c | 16 +++++++++++++++- + 1 file changed, 15 insertions(+), 1 deletion(-) + +diff --git a/bgzf.c b/bgzf.c +index f2e9b1e..184cf89 100644 +--- a/bgzf.c ++++ b/bgzf.c +@@ -1155,7 +1155,21 @@ ssize_t bgzf_read(BGZF *fp, void *data, size_t length) + return -1; + } + available = fp->block_length - fp->block_offset; +- if (available <= 0) break; ++ if (available == 0) { ++ if (fp->block_length == 0) ++ break; // EOF ++ ++ // Offset was at end of block (see commit e9863a0) ++ fp->block_address = bgzf_htell(fp); ++ fp->block_offset = fp->block_length = 0; ++ continue; ++ } else if (available < 0) { ++ // Block offset was set to an invalid coordinate ++ hts_log_error("BGZF block offset %d set beyond block size %d", ++ fp->block_offset, fp->block_length); ++ fp->errcode |= BGZF_ERR_MISUSE; ++ return -1; ++ } + } + copy_length = length - bytes_read < available? length - bytes_read : available; + buffer = (uint8_t*)fp->uncompressed_block; +-- +2.25.1 + diff --git a/tools/patches/README b/tools/patches/README new file mode 100644 index 00000000..29cde4f0 --- /dev/null +++ b/tools/patches/README @@ -0,0 +1,3 @@ +These patches are from the devel branch of htslib in github and +correct a bug in htslib that affected the --write-index option in +samtools 1.10 and could cause contigs to be skipped from the index. 
From 2f621b162daa6a74960ade42797f80eca945e8d3 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Tue, 25 Feb 2020 07:40:52 +0100 Subject: [PATCH 51/61] Remove --reference option when creating CRAM files in benchmark mode --- gemBS/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index ea73f620..5c16cce6 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -799,7 +799,9 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None,filetyp if benchmark_mode: bamSort.append("--no-PG") if outfile.endswith('.cram'): - bamSort.extend(['-O', 'CRAM', '--reference', greference ]); + bamSort.extend(['-O', 'CRAM']); + if not benchmark_mode: + bamSort.extend(['--reference', greference ]); bamSort.append('-'); tools = [mapping,readNameClean,bamSort] @@ -842,7 +844,9 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benc if benchmark_mode: bammerging.append("--no-PG") if bam_filename.endswith('.cram'): - bammerging.extend(['-O', 'CRAM', '--reference', greference]); + bammerging.extend(['-O', 'CRAM']); + if not benchmark_mode: + bamSort.extend(['--reference', greference ]); bammerging.extend(["-f",bam_filename]) for bamFile in inputs: bammerging.append(bamFile) From 89d21617dd11b51c3e19dc01faffdbc41d74f188 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 26 Feb 2020 08:32:51 +0100 Subject: [PATCH 52/61] Add missing benchmark-mode argument for bam and bcf merge commands --- gemBS/production.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gemBS/production.py b/gemBS/production.py index 286b690e..3d9fc41a 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -773,6 +773,7 @@ def register(self,parser): parser.add_argument('--dry-run', dest="dry_run", action="store_true", help="Output mapping commands without execution") parser.add_argument('--json', dest="dry_run_json",metavar="JSON FILE",help="Output JSON file with details of pending 
commands") parser.add_argument('--ignore-db', dest="ignore_db", action="store_true",help="Ignore database for --dry-run and --json commands") + parser.add_argument('--benchmark-mode', dest="benchmark_mode", action="store_true",help="Omit dates etc. to make file comparison simpler", required=False) def run(self, args): self.command = 'merge-bams' @@ -782,6 +783,7 @@ def run(self, args): self.threads = self.jsonData.check(section='mapping',key='threads',arg=args.threads,default='1') self.merge_threads = self.jsonData.check(section='mapping',key='merge_threads',arg=args.threads,default=self.threads) self.remove = self.jsonData.check(section='mapping',key='remove_individual_bams',arg=args.remove, boolean=True) + self.benchmark_mode = self.jsonData.check(section='mapping',key='benchmark_mode',arg=args.benchmark_mode, boolean=True) self.dry_run = args.dry_run self.dry_run_json = args.dry_run_json if self.dry_run or self.dry_run_json: @@ -1265,6 +1267,7 @@ def register(self,parser): parser.add_argument('--json', dest="dry_run_json",metavar="JSON FILE",help="Output JSON file with details of pending commands") parser.add_argument('--ignore-db', dest="ignore_db", action="store_true",help="Ignore database for --dry-run and --json commands") parser.add_argument('--ignore-dep', dest="ignore_dep", action="store_true",help="Ignore dependencies for --dry-run and --json commands") + parser.add_argument('--benchmark-mode', dest="benchmark_mode", action="store_true",help="Omit dates etc. 
to make file comparison simpler", required=False) def run(self,args): From cc5df500e804e1726eb112e47b7fe332cbf7240d Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 26 Feb 2020 08:47:26 +0100 Subject: [PATCH 53/61] Turn off multi-threading for CRAM generation in benchmark mode --- gemBS/__init__.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index 5c16cce6..c72f24d9 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -793,7 +793,7 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None,filetyp readNameClean = [executables['readNameClean'], contig_md5] #BAM SORT - bamSort = [executables['samtools'],"sort","-T",os.path.join(tmpDir,name),"-@",sort_threads,"-m",sort_memory,"-o",outfile] + bamSort = [executables['samtools'],"sort","-T",os.path.join(tmpDir,name),"-m",sort_memory,"-o",outfile] if filetype == 'SINGLE_BAM': bamSort.append("--write-index") if benchmark_mode: @@ -801,7 +801,10 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None,filetyp if outfile.endswith('.cram'): bamSort.extend(['-O', 'CRAM']); if not benchmark_mode: - bamSort.extend(['--reference', greference ]); + bamSort.extend(['--reference', greference, "-@", sort_threads]); + else: + bamSort.extend(["-@", sort_threads]); + bamSort.append('-'); tools = [mapping,readNameClean,bamSort] @@ -840,13 +843,16 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benc return_info = [] if inputs: - bammerging.extend([executables['samtools'],"merge","--threads",threads,"--write-index"]) + bammerging.extend([executables['samtools'],"merge","--write-index"]) if benchmark_mode: bammerging.append("--no-PG") if bam_filename.endswith('.cram'): bammerging.extend(['-O', 'CRAM']); if not benchmark_mode: - bamSort.extend(['--reference', greference ]); + bamSort.extend(['--reference', greference, '--threads', threads]); + else: + bamSort.extend(['--threads', threads]); 
+ bammerging.extend(["-f",bam_filename]) for bamFile in inputs: bammerging.append(bamFile) From 0724ced33f122376d22c6ab9ef6d5aad548f5595 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 26 Feb 2020 13:13:11 +0100 Subject: [PATCH 54/61] Add pthread_mut_broadcast() calls to prevent occasional hangs during cleanup --- tools/bs_call | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bs_call b/tools/bs_call index 728748b5..c172df48 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 728748b59c368be269ebbbc5ea62b4a7db17a1c1 +Subproject commit c172df48590c360f218c0fe958ff64ae5e92a5ec From f2522523259e3a83270a9c2d9e3acb7bde0b062f Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 26 Feb 2020 13:52:06 +0100 Subject: [PATCH 55/61] Update recipes to reflect new build process --- Dockerfile | 3 +-- IHEC/Singularity.ihec | 2 +- Singularity | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index d37bb668..34b7e498 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,7 @@ FROM ubuntu:xenial MAINTAINER Simon Heath (simon.heath@gmail.com) RUN apt-get update -RUN apt-get install -y libpng-dev uuid-dev libmysqlclient-dev -RUN apt-get install -y python3 build-essential git python3-pip wget pigz +RUN apt-get install -y python3 build-essential git autoconf python3-pip wget lbzip2 RUN apt-get install -y zlib1g-dev libbz2-dev gsl-bin libgsl0-dev RUN apt-get install -y libncurses5-dev liblzma-dev libssl-dev libcurl4-openssl-dev RUN pip3 install 'matplotlib<3.0' multiprocess diff --git a/IHEC/Singularity.ihec b/IHEC/Singularity.ihec index e6fb8e46..15f973c4 100644 --- a/IHEC/Singularity.ihec +++ b/IHEC/Singularity.ihec @@ -10,7 +10,7 @@ From: ubuntu:16.04 %post (mkdir /ext && cd /ext && mkdir disk1 disk2 disk3 disk4 disk5 disk6 disk7 disk8 disk9) apt-get update - apt-get install -y python3 build-essential git python3-pip wget lbzip2 + apt-get install -y python3 build-essential git autoconf 
python3-pip wget lbzip2 apt-get install -y zlib1g-dev libbz2-dev gsl-bin libgsl0-dev apt-get install -y libncurses5-dev liblzma-dev libssl-dev libcurl4-openssl-dev pip3 install 'matplotlib<3.0' multiprocess diff --git a/Singularity b/Singularity index e6fb8e46..15f973c4 100644 --- a/Singularity +++ b/Singularity @@ -10,7 +10,7 @@ From: ubuntu:16.04 %post (mkdir /ext && cd /ext && mkdir disk1 disk2 disk3 disk4 disk5 disk6 disk7 disk8 disk9) apt-get update - apt-get install -y python3 build-essential git python3-pip wget lbzip2 + apt-get install -y python3 build-essential git autoconf python3-pip wget lbzip2 apt-get install -y zlib1g-dev libbz2-dev gsl-bin libgsl0-dev apt-get install -y libncurses5-dev liblzma-dev libssl-dev libcurl4-openssl-dev pip3 install 'matplotlib<3.0' multiprocess From 03c7997170e126f3a99c8aa7c62bc5005ac8d8f4 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 26 Feb 2020 16:14:01 +0100 Subject: [PATCH 56/61] Fix typo --- gemBS/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index c72f24d9..e8ed7858 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -849,9 +849,9 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benc if bam_filename.endswith('.cram'): bammerging.extend(['-O', 'CRAM']); if not benchmark_mode: - bamSort.extend(['--reference', greference, '--threads', threads]); + bammerging.extend(['--reference', greference, '--threads', threads]); else: - bamSort.extend(['--threads', threads]); + bammerging.extend(['--threads', threads]); bammerging.extend(["-f",bam_filename]) for bamFile in inputs: From 0f141a855ce70317a7c8cb587f469e02186d0131 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Tue, 17 Mar 2020 17:36:35 +0100 Subject: [PATCH 57/61] Singularity Hub does not seem to work with Library:, switch back to using Docker bootstrap --- IHEC/Singularity.ihec | 4 ++-- Singularity | 4 ++-- 2 files changed, 4 insertions(+), 4 
deletions(-) diff --git a/IHEC/Singularity.ihec b/IHEC/Singularity.ihec index 15f973c4..dbf29ad4 100644 --- a/IHEC/Singularity.ihec +++ b/IHEC/Singularity.ihec @@ -1,5 +1,5 @@ -BootStrap: library -From: ubuntu:16.04 +BootStrap: docker +From: ubuntu:xenial %runscript exec /usr/local/bin/gemBS $@ diff --git a/Singularity b/Singularity index 15f973c4..dbf29ad4 100644 --- a/Singularity +++ b/Singularity @@ -1,5 +1,5 @@ -BootStrap: library -From: ubuntu:16.04 +BootStrap: docker +From: ubuntu:xenial %runscript exec /usr/local/bin/gemBS $@ From 8b33580390dd0f975dbb0f7e3f72ed34ce7b17c1 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Wed, 18 Mar 2020 12:35:53 +0100 Subject: [PATCH 58/61] Switch back to using multiple threads for CRAM generation in benchmark mode, as even with a single thread the file sizes can differ --- gemBS/__init__.py | 8 ++++---- gemBS/production.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/gemBS/__init__.py b/gemBS/__init__.py index e8ed7858..3e41753e 100644 --- a/gemBS/__init__.py +++ b/gemBS/__init__.py @@ -799,9 +799,9 @@ def mapping(name=None,index=None,fliInfo=None,inputFiles=None,ftype=None,filetyp if benchmark_mode: bamSort.append("--no-PG") if outfile.endswith('.cram'): - bamSort.extend(['-O', 'CRAM']); + bamSort.extend(['-O', 'CRAM', "-@", sort_threads]); if not benchmark_mode: - bamSort.extend(['--reference', greference, "-@", sort_threads]); + bamSort.extend(['--reference', greference]); else: bamSort.extend(["-@", sort_threads]); @@ -847,9 +847,9 @@ def merging(inputs=None,sample=None,threads="1",outname=None,tmpDir="/tmp/",benc if benchmark_mode: bammerging.append("--no-PG") if bam_filename.endswith('.cram'): - bammerging.extend(['-O', 'CRAM']); + bammerging.extend(['-O', 'CRAM', '--threads', threads]); if not benchmark_mode: - bammerging.extend(['--reference', greference, '--threads', threads]); + bammerging.extend(['--reference', greference]); else: bammerging.extend(['--threads', threads]); diff --git 
a/gemBS/production.py b/gemBS/production.py index 3d9fc41a..6e922fec 100644 --- a/gemBS/production.py +++ b/gemBS/production.py @@ -989,7 +989,7 @@ def run(self,args): if len(self.contig_list) == 1: if os.path.isfile(self.contig_list[0]): #Check if contig_list is a file or just a list of chromosomes - #Parse file to extract chromosme list + #Parse file to extract chromosome list tmp_list = [] with open(self.contig_list[0] , 'r') as chromFile: for line in chromFile: @@ -1360,7 +1360,7 @@ def register(self,parser): parser.add_argument('-N','--non-cpg', dest="non_cpg", action="store_true", help="Output gemBS bed with non-cpg sites.") parser.add_argument('-B','--bed-methyl', dest="bedMethyl", action="store_true", help="Output bedMethyl files (bed and bigBed)") parser.add_argument('-S','--snps', dest="snps", action="store_true",help="Output SNPs") - parser.add_argument('--extract-threads', dest="extract_threads", metavar="THREADS", help='Number of extra threads for extract step') + parser.add_argument('-t','--extract-threads', dest="extract_threads", metavar="THREADS", help='Number of extra threads for extract step') parser.add_argument('--snp-list', dest="snp_list", help="List of SNPs to output") parser.add_argument('--snp-db', dest="snp_db", help="dbSNP_idx processed SNP idx") parser.add_argument('--dry-run', dest="dry_run", action="store_true", help="Output mapping commands without execution") From 522d41a6624b97ea406962fb236523ad4ba77275 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Fri, 20 Mar 2020 19:31:18 +0100 Subject: [PATCH 59/61] Update installation path for htslib in setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b095d25d..1367c1e0 100644 --- a/setup.py +++ b/setup.py @@ -111,7 +111,7 @@ def _install_bundle(install_dir, inst): # print ("Copy binary: samtools to {}".format(bin_dir)) shutil.copy("tools/samtools/samtools", bin_dir) os.chmod(os.path.join(bin_dir, "samtools"), 0o755) - for 
htslib in glob.glob("tools/samtools/htslib*"): + for htslib in glob.glob("tools/htslib"): if os.path.isdir(htslib): for file in ["htsfile", "tabix", "bgzip"]: if os.path.exists(os.path.join(htslib,file)): From 0cdae339aaa3cbfc9b40521a1436eae933fee641 Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sat, 21 Mar 2020 10:03:15 +0100 Subject: [PATCH 60/61] Check if --std=c99 is required to make compiler standards conformant --- tools/Makefile | 1 + tools/bs_call | 2 +- tools/utils/Makefile.in | 4 +- tools/utils/configure | 2917 +++++++++++++++++++++++++++----------- tools/utils/configure.ac | 1 + 5 files changed, 2114 insertions(+), 811 deletions(-) diff --git a/tools/Makefile b/tools/Makefile index ebd8f3aa..e911d8ba 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -110,6 +110,7 @@ distclean: @rm -rf $(FOLDER_BIN) if [ -f $(SAMTOOLS_DIR)/Makefile ]; then cd $(SAMTOOLS_DIR); $(MAKE) clean; rm -f config.h config.log config.status config.mk; fi if [ -f $(BCFTOOLS_DIR)/Makefile ]; then cd $(BCFTOOLS_DIR); $(MAKE) clean; rm -f config.h config.log config.status config.mk; fi + if [ -f $(HTSLIB_DIR)/Makefile ]; then cd $(HTSLIB_DIR); $(MAKE) clean; rm -f config.h config.log config.status config.mk; fi if [ -f "gem3-mapper/Makefile.mk" ]; then $(MAKE) --directory=gem3-mapper distclean; fi if [ -f "bs_call/src/Makefile.mk" ]; then $(MAKE) --directory=bs_call distclean; fi if [ -f "utils/Makefile" ]; then $(MAKE) --directory=utils distclean; fi diff --git a/tools/bs_call b/tools/bs_call index c172df48..8592d85a 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit c172df48590c360f218c0fe958ff64ae5e92a5ec +Subproject commit 8592d85a2fe58ec58f59c630728153694c19bbc4 diff --git a/tools/utils/Makefile.in b/tools/utils/Makefile.in index 425c86b2..6904cfd3 100644 --- a/tools/utils/Makefile.in +++ b/tools/utils/Makefile.in @@ -9,14 +9,14 @@ # Definitions -CC=gcc +CC=@CC@ ROOT_PATH=.. 
TOOLS=gemBS_cat readNameClean md5_fasta mextr snpxtr FOLDER_BIN=../bin TOOLS_BIN=$(addprefix $(FOLDER_BIN)/, $(TOOLS)) -LIBS:= -lm +LIBS:= $(LDFLAGS) -lm GENERAL_FLAGS = -I. -Icommon MEXTR_INC = @HTSINC@ -Imextr diff --git a/tools/utils/configure b/tools/utils/configure index 12aced1d..dc099b87 100755 --- a/tools/utils/configure +++ b/tools/utils/configure @@ -1,20 +1,18 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for gemBS_utils 1.0. -# -# -# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. -# +# Generated by GNU Autoconf 2.63 for gemBS_utils 1.0. # +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, +# 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. # This configure script is free software; the Free Software Foundation # gives unlimited permission to copy, distribute and modify it. -## -------------------- ## -## M4sh Initialization. ## -## -------------------- ## +## --------------------- ## +## M4sh Initialization. ## +## --------------------- ## # Be more Bourne compatible DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which @@ -22,15 +20,23 @@ if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : alias -g '${1+"$@"}'='"$@"' setopt NO_GLOB_SUBST else - case `(set -o) 2>/dev/null` in #( - *posix*) : - set -o posix ;; #( - *) : - ;; + case `(set -o) 2>/dev/null` in + *posix*) set -o posix ;; esac + fi + + +# PATH needs CR +# Avoid depending upon Character Ranges. 
+as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + as_nl=' ' export as_nl @@ -38,13 +44,7 @@ export as_nl as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo -# Prefer a ksh shell builtin over an external printf program on Solaris, -# but without wasting forks for bash or zsh. -if test -z "$BASH_VERSION$ZSH_VERSION" \ - && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='print -r --' - as_echo_n='print -rn --' -elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then +if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then as_echo='printf %s\n' as_echo_n='printf %s' else @@ -55,7 +55,7 @@ else as_echo_body='eval expr "X$1" : "X\\(.*\\)"' as_echo_n_body='eval arg=$1; - case $arg in #( + case $arg in *"$as_nl"*) expr "X$arg" : "X\\(.*\\)$as_nl"; arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; @@ -78,6 +78,13 @@ if test "${PATH_SEPARATOR+set}" != set; then } fi +# Support unset when possible. +if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then + as_unset=unset +else + as_unset=false +fi + # IFS # We need space, tab and new line, in precisely that order. Quoting is @@ -87,16 +94,15 @@ fi IFS=" "" $as_nl" # Find who we are. Look in the path if we contain no directory separator. -as_myself= -case $0 in #(( +case $0 in *[\\/]* ) as_myself=$0 ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break - done + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break +done IFS=$as_save_IFS ;; @@ -108,16 +114,12 @@ if test "x$as_myself" = x; then fi if test ! 
-f "$as_myself"; then $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 - exit 1 + { (exit 1); exit 1; } fi -# Unset variables that we do not need and which cause bugs (e.g. in -# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" -# suppresses any "Segmentation fault" message there. '((' could -# trigger a bug in pdksh 5.2.14. -for as_var in BASH_ENV ENV MAIL MAILPATH -do eval test x\${$as_var+set} = xset \ - && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +# Work around bugs in pre-3.0 UWIN ksh. +for as_var in ENV MAIL MAILPATH +do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var done PS1='$ ' PS2='> ' @@ -129,343 +131,330 @@ export LC_ALL LANGUAGE=C export LANGUAGE +# Required to use basename. +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + + +# Name of the executable. +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + # CDPATH. -(unset CDPATH) >/dev/null 2>&1 && unset CDPATH - -# Use a proper internal environment variable to ensure we don't fall - # into an infinite loop, continuously re-executing ourselves. - if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then - _as_can_reexec=no; export _as_can_reexec; - # We cannot yet assume a decent shell, so we have to provide a -# neutralization value for shells without unset; and this also -# works around shells that cannot unset nonexistent variables. -# Preserve -v and -x to the replacement shell. 
-BASH_ENV=/dev/null -ENV=/dev/null -(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV -case $- in # (((( - *v*x* | *x*v* ) as_opts=-vx ;; - *v* ) as_opts=-v ;; - *x* ) as_opts=-x ;; - * ) as_opts= ;; -esac -exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} -# Admittedly, this is quite paranoid, since all the known shells bail -# out after a failed `exec'. -$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 -as_fn_exit 255 - fi - # We don't want this to propagate to other subprocesses. - { _as_can_reexec=; unset _as_can_reexec;} +$as_unset CDPATH + + if test "x$CONFIG_SHELL" = x; then - as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : - emulate sh - NULLCMD=: - # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which - # is contrary to our usage. Disable this feature. - alias -g '\${1+\"\$@\"}'='\"\$@\"' - setopt NO_GLOB_SUBST + if (eval ":") 2>/dev/null; then + as_have_required=yes else - case \`(set -o) 2>/dev/null\` in #( - *posix*) : - set -o posix ;; #( - *) : - ;; -esac + as_have_required=no fi -" - as_required="as_fn_return () { (exit \$1); } -as_fn_success () { as_fn_return 0; } -as_fn_failure () { as_fn_return 1; } -as_fn_ret_success () { return 0; } -as_fn_ret_failure () { return 1; } + + if test $as_have_required = yes && (eval ": +(as_func_return () { + (exit \$1) +} +as_func_success () { + as_func_return 0 +} +as_func_failure () { + as_func_return 1 +} +as_func_ret_success () { + return 0 +} +as_func_ret_failure () { + return 1 +} exitcode=0 -as_fn_success || { exitcode=1; echo as_fn_success failed.; } -as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } -as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } -as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } -if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : - -else - exitcode=1; echo positional parameters were not saved. 
-fi -test x\$exitcode = x0 || exit 1 -test -x / || exit 1" - as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO - as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO - eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && - test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1" - if (eval "$as_required") 2>/dev/null; then : - as_have_required=yes +if as_func_success; then + : else - as_have_required=no + exitcode=1 + echo as_func_success failed. +fi + +if as_func_failure; then + exitcode=1 + echo as_func_failure succeeded. +fi + +if as_func_ret_success; then + : +else + exitcode=1 + echo as_func_ret_success failed. fi - if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : +if as_func_ret_failure; then + exitcode=1 + echo as_func_ret_failure succeeded. +fi + +if ( set x; as_func_ret_success y && test x = \"\$1\" ); then + : +else + exitcode=1 + echo positional parameters were not saved. +fi + +test \$exitcode = 0) || { (exit 1); exit 1; } + +( + as_lineno_1=\$LINENO + as_lineno_2=\$LINENO + test \"x\$as_lineno_1\" != \"x\$as_lineno_2\" && + test \"x\`expr \$as_lineno_1 + 1\`\" = \"x\$as_lineno_2\") || { (exit 1); exit 1; } +") 2> /dev/null; then + : else - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -as_found=false + as_candidate_shells= + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. - as_found=: - case $as_dir in #( + case $as_dir in /*) for as_base in sh bash ksh sh5; do - # Try only shells that exist, to save several forks. 
- as_shell=$as_dir/$as_base - if { test -f "$as_shell" || test -f "$as_shell.exe"; } && - { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : - CONFIG_SHELL=$as_shell as_have_required=yes - if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : - break 2 -fi -fi + as_candidate_shells="$as_candidate_shells $as_dir/$as_base" done;; esac - as_found=false done -$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && - { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : - CONFIG_SHELL=$SHELL as_have_required=yes -fi; } IFS=$as_save_IFS - if test "x$CONFIG_SHELL" != x; then : - export CONFIG_SHELL - # We cannot yet assume a decent shell, so we have to provide a -# neutralization value for shells without unset; and this also -# works around shells that cannot unset nonexistent variables. -# Preserve -v and -x to the replacement shell. -BASH_ENV=/dev/null -ENV=/dev/null -(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV -case $- in # (((( - *v*x* | *x*v* ) as_opts=-vx ;; - *v* ) as_opts=-v ;; - *x* ) as_opts=-x ;; - * ) as_opts= ;; + for as_shell in $as_candidate_shells $SHELL; do + # Try only shells that exist, to save several forks. + if { test -f "$as_shell" || test -f "$as_shell.exe"; } && + { ("$as_shell") 2> /dev/null <<\_ASEOF +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in + *posix*) set -o posix ;; esac -exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} -# Admittedly, this is quite paranoid, since all the known shells bail -# out after a failed `exec'. 
-$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 -exit 255 -fi - - if test x$as_have_required = xno; then : - $as_echo "$0: This script requires a shell more modern than all" - $as_echo "$0: the shells that I found on your system." - if test x${ZSH_VERSION+set} = xset ; then - $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" - $as_echo "$0: be upgraded to zsh 4.3.4 or later." - else - $as_echo "$0: Please tell bug-autoconf@gnu.org about your system, -$0: including any error possibly output before this -$0: message. Then install a modern shell, or manually run -$0: the script under such a shell if you do have one." - fi - exit 1 -fi -fi + fi -SHELL=${CONFIG_SHELL-/bin/sh} -export SHELL -# Unset more variables known to interfere with behavior of common tools. -CLICOLOR_FORCE= GREP_OPTIONS= -unset CLICOLOR_FORCE GREP_OPTIONS -## --------------------- ## -## M4sh Shell Functions. ## -## --------------------- ## -# as_fn_unset VAR -# --------------- -# Portably unset VAR. -as_fn_unset () -{ - { eval $1=; unset $1;} -} -as_unset=as_fn_unset -# as_fn_set_status STATUS -# ----------------------- -# Set $? to STATUS, without forking. -as_fn_set_status () -{ - return $1 -} # as_fn_set_status +: +_ASEOF +}; then + CONFIG_SHELL=$as_shell + as_have_required=yes + if { "$as_shell" 2> /dev/null <<\_ASEOF +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in + *posix*) set -o posix ;; +esac -# as_fn_exit STATUS -# ----------------- -# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. -as_fn_exit () -{ - set +e - as_fn_set_status $1 - exit $1 -} # as_fn_exit - -# as_fn_mkdir_p -# ------------- -# Create "$as_dir" as a directory, including parents if necessary. 
-as_fn_mkdir_p () -{ +fi - case $as_dir in #( - -*) as_dir=./$as_dir;; - esac - test -d "$as_dir" || eval $as_mkdir_p || { - as_dirs= - while :; do - case $as_dir in #( - *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( - *) as_qdir=$as_dir;; - esac - as_dirs="'$as_qdir' $as_dirs" - as_dir=`$as_dirname -- "$as_dir" || -$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$as_dir" : 'X\(//\)[^/]' \| \ - X"$as_dir" : 'X\(//\)$' \| \ - X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_dir" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - test -d "$as_dir" && break - done - test -z "$as_dirs" || eval "mkdir $as_dirs" - } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" +: +(as_func_return () { + (exit $1) +} +as_func_success () { + as_func_return 0 +} +as_func_failure () { + as_func_return 1 +} +as_func_ret_success () { + return 0 +} +as_func_ret_failure () { + return 1 +} + +exitcode=0 +if as_func_success; then + : +else + exitcode=1 + echo as_func_success failed. +fi -} # as_fn_mkdir_p +if as_func_failure; then + exitcode=1 + echo as_func_failure succeeded. +fi -# as_fn_executable_p FILE -# ----------------------- -# Test if FILE is an executable regular file. -as_fn_executable_p () -{ - test -f "$1" && test -x "$1" -} # as_fn_executable_p -# as_fn_append VAR VALUE -# ---------------------- -# Append the text in VALUE to the end of the definition contained in VAR. Take -# advantage of any shell optimizations that allow amortized linear growth over -# repeated appends, instead of the typical quadratic growth present in naive -# implementations. 
-if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : - eval 'as_fn_append () - { - eval $1+=\$2 - }' +if as_func_ret_success; then + : else - as_fn_append () - { - eval $1=\$$1\$2 - } -fi # as_fn_append - -# as_fn_arith ARG... -# ------------------ -# Perform arithmetic evaluation on the ARGs, and store the result in the -# global $as_val. Take advantage of shells that can avoid forks. The arguments -# must be portable across $(()) and expr. -if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : - eval 'as_fn_arith () - { - as_val=$(( $* )) - }' + exitcode=1 + echo as_func_ret_success failed. +fi + +if as_func_ret_failure; then + exitcode=1 + echo as_func_ret_failure succeeded. +fi + +if ( set x; as_func_ret_success y && test x = "$1" ); then + : else - as_fn_arith () - { - as_val=`expr "$@" || test $? -eq 1` - } -fi # as_fn_arith + exitcode=1 + echo positional parameters were not saved. +fi +test $exitcode = 0) || { (exit 1); exit 1; } -# as_fn_error STATUS ERROR [LINENO LOG_FD] -# ---------------------------------------- -# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are -# provided, also output the error to LOG_FD, referencing LINENO. Then exit the -# script with STATUS, using 1 if that was 0. 
-as_fn_error () -{ - as_status=$1; test $as_status -eq 0 && as_status=1 - if test "$4"; then - as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 - fi - $as_echo "$as_me: error: $2" >&2 - as_fn_exit $as_status -} # as_fn_error +( + as_lineno_1=$LINENO + as_lineno_2=$LINENO + test "x$as_lineno_1" != "x$as_lineno_2" && + test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2") || { (exit 1); exit 1; } -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr +_ASEOF +}; then + break +fi + +fi + + done + + if test "x$CONFIG_SHELL" != x; then + for as_var in BASH_ENV ENV + do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var + done + export CONFIG_SHELL + exec "$CONFIG_SHELL" "$as_myself" ${1+"$@"} +fi + + + if test $as_have_required = no; then + echo This script requires a shell more modern than all the + echo shells that I found on your system. Please install a + echo modern shell, or manually run the script under such a + echo shell if you do have one. + { (exit 1); exit 1; } +fi + + +fi + +fi + + + +(eval "as_func_return () { + (exit \$1) +} +as_func_success () { + as_func_return 0 +} +as_func_failure () { + as_func_return 1 +} +as_func_ret_success () { + return 0 +} +as_func_ret_failure () { + return 1 +} + +exitcode=0 +if as_func_success; then + : else - as_expr=false + exitcode=1 + echo as_func_success failed. fi -if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then - as_basename=basename +if as_func_failure; then + exitcode=1 + echo as_func_failure succeeded. +fi + +if as_func_ret_success; then + : else - as_basename=false + exitcode=1 + echo as_func_ret_success failed. fi -if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then - as_dirname=dirname +if as_func_ret_failure; then + exitcode=1 + echo as_func_ret_failure succeeded. 
+fi + +if ( set x; as_func_ret_success y && test x = \"\$1\" ); then + : else - as_dirname=false + exitcode=1 + echo positional parameters were not saved. fi -as_me=`$as_basename -- "$0" || -$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ - X"$0" : 'X\(//\)$' \| \ - X"$0" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X/"$0" | - sed '/^.*\/\([^/][^/]*\)\/*$/{ - s//\1/ - q - } - /^X\/\(\/\/\)$/{ - s//\1/ - q - } - /^X\/\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` +test \$exitcode = 0") || { + echo No shell found that supports shell functions. + echo Please tell bug-autoconf@gnu.org about your system, + echo including any error possibly output before this message. + echo This can help us improve future autoconf versions. + echo Configuration will now proceed without shell functions. +} -# Avoid depending upon Character Ranges. -as_cr_letters='abcdefghijklmnopqrstuvwxyz' -as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' -as_cr_Letters=$as_cr_letters$as_cr_LETTERS -as_cr_digits='0123456789' -as_cr_alnum=$as_cr_Letters$as_cr_digits - as_lineno_1=$LINENO as_lineno_1a=$LINENO - as_lineno_2=$LINENO as_lineno_2a=$LINENO - eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" && - test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || { - # Blame Lee E. McMahon (1931-1989) for sed's syntax. :-) + as_lineno_1=$LINENO + as_lineno_2=$LINENO + test "x$as_lineno_1" != "x$as_lineno_2" && + test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || { + + # Create $as_me.lineno as a copy of $as_myself, but with $LINENO + # uniformly replaced by the line number. The first 'sed' inserts a + # line-number line after each line using $LINENO; the second 'sed' + # does the real work. The second script uses 'N' to pair each + # line-number line with the line containing $LINENO, and appends + # trailing '-' during substitution so that $LINENO is not a special + # case at line end. 
+ # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the + # scripts with optimization help from Paolo Bonzini. Blame Lee + # E. McMahon (1931-1989) for sed's syntax. :-) sed -n ' p /[$]LINENO/= @@ -482,12 +471,9 @@ as_cr_alnum=$as_cr_Letters$as_cr_digits s/-\n.*// ' >$as_me.lineno && chmod +x "$as_me.lineno" || - { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } + { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2 + { (exit 1); exit 1; }; } - # If we had to re-execute with $CONFIG_SHELL, we're ensured to have - # already done that, so ensure we don't try to do so again and fall - # in an infinite loop. This has already happened in practice. - _as_can_reexec=no; export _as_can_reexec # Don't try to exec as it changes $[0], causing all sort of problems # (the dirname of $[0] is not the place where we might find the # original and so on. Autoconf is especially sensitive to this). @@ -496,20 +482,31 @@ as_cr_alnum=$as_cr_Letters$as_cr_digits exit } + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + ECHO_C= ECHO_N= ECHO_T= -case `echo -n x` in #((((( +case `echo -n x` in -n*) - case `echo 'xy\c'` in + case `echo 'x\c'` in *c*) ECHO_T=' ';; # ECHO_T is single tab character. - xy) ECHO_C='\c';; - *) echo `echo ksh88 bug on AIX 6.1` > /dev/null - ECHO_T=' ';; + *) ECHO_C='\c';; esac;; *) ECHO_N='-n';; esac - -rm -f conf$$ conf$$.exe conf$$.file +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +rm -f conf$$ conf$$.exe conf$$.file if test -d conf$$.dir; then rm -f conf$$.dir/conf$$.file else @@ -522,29 +519,49 @@ if (echo >conf$$.file) 2>/dev/null; then # ... but there are two gotchas: # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. 
- # In both cases, we have to default to `cp -pR'. + # In both cases, we have to default to `cp -p'. ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || - as_ln_s='cp -pR' + as_ln_s='cp -p' elif ln conf$$.file conf$$ 2>/dev/null; then as_ln_s=ln else - as_ln_s='cp -pR' + as_ln_s='cp -p' fi else - as_ln_s='cp -pR' + as_ln_s='cp -p' fi rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file rmdir conf$$.dir 2>/dev/null if mkdir -p . 2>/dev/null; then - as_mkdir_p='mkdir -p "$as_dir"' + as_mkdir_p=: else test -d ./-p && rmdir ./-p as_mkdir_p=false fi -as_test_x='test -x' -as_executable_p=as_fn_executable_p +if test -x / >/dev/null 2>&1; then + as_test_x='test -x' +else + if ls -dL / >/dev/null 2>&1; then + as_ls_L_option=L + else + as_ls_L_option= + fi + as_test_x=' + eval sh -c '\'' + if test -d "$1"; then + test -d "$1/."; + else + case $1 in + -*)set "./$1";; + esac; + case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in + ???[sx]*):;;*)false;;esac;fi + '\'' sh + ' +fi +as_executable_p=$as_test_x # Sed expression to map a string onto a valid CPP name. as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" @@ -553,11 +570,11 @@ as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" -test -n "$DJDIR" || exec 7<&0 &1 + +exec 7<&0 &1 # Name of the host. -# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status, +# hostname on some systems (SVR3.2, Linux) returns a bogus exit status, # so uname gets run too. ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` @@ -572,6 +589,7 @@ cross_compiling=no subdirs= MFLAGS= MAKEFLAGS= +SHELL=${CONFIG_SHELL-/bin/sh} # Identity of this package. 
PACKAGE_NAME='gemBS_utils' @@ -579,12 +597,18 @@ PACKAGE_TARNAME='gembs_utils' PACKAGE_VERSION='1.0' PACKAGE_STRING='gemBS_utils 1.0' PACKAGE_BUGREPORT='' -PACKAGE_URL='' ac_subst_vars='LTLIBOBJS LIBOBJS HTSINC HTSLIBS +OBJEXT +EXEEXT +ac_ct_CC +CPPFLAGS +LDFLAGS +CFLAGS +CC target_alias host_alias build_alias @@ -615,7 +639,6 @@ bindir program_transform_name prefix exec_prefix -PACKAGE_URL PACKAGE_BUGREPORT PACKAGE_STRING PACKAGE_VERSION @@ -630,7 +653,12 @@ with_htslib ' ac_precious_vars='build_alias host_alias -target_alias' +target_alias +CC +CFLAGS +LDFLAGS +LIBS +CPPFLAGS' # Initialize some variables set by options. @@ -693,9 +721,8 @@ do fi case $ac_option in - *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; - *=) ac_optarg= ;; - *) ac_optarg=yes ;; + *=*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; + *) ac_optarg=yes ;; esac # Accept the important Cygnus configure options, so we can diagnose typos. @@ -740,7 +767,8 @@ do ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid feature name: $ac_useropt" + { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2 + { (exit 1); exit 1; }; } ac_useropt_orig=$ac_useropt ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in @@ -766,7 +794,8 @@ do ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid feature name: $ac_useropt" + { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2 + { (exit 1); exit 1; }; } ac_useropt_orig=$ac_useropt ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in @@ -970,7 +999,8 @@ do ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` # Reject names that are not valid shell variable names. 
expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid package name: $ac_useropt" + { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2 + { (exit 1); exit 1; }; } ac_useropt_orig=$ac_useropt ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in @@ -986,7 +1016,8 @@ do ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` # Reject names that are not valid shell variable names. expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - as_fn_error $? "invalid package name: $ac_useropt" + { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2 + { (exit 1); exit 1; }; } ac_useropt_orig=$ac_useropt ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` case $ac_user_opts in @@ -1016,17 +1047,17 @@ do | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) x_libraries=$ac_optarg ;; - -*) as_fn_error $? "unrecognized option: \`$ac_option' -Try \`$0 --help' for more information" + -*) { $as_echo "$as_me: error: unrecognized option: $ac_option +Try \`$0 --help' for more information." >&2 + { (exit 1); exit 1; }; } ;; *=*) ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` # Reject names that are not valid shell variable names. - case $ac_envvar in #( - '' | [0-9]* | *[!_$as_cr_alnum]* ) - as_fn_error $? 
"invalid variable name: \`$ac_envvar'" ;; - esac + expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null && + { $as_echo "$as_me: error: invalid variable name: $ac_envvar" >&2 + { (exit 1); exit 1; }; } eval $ac_envvar=\$ac_optarg export $ac_envvar ;; @@ -1035,7 +1066,7 @@ Try \`$0 --help' for more information" $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 - : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" + : ${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option} ;; esac @@ -1043,13 +1074,15 @@ done if test -n "$ac_prev"; then ac_option=--`echo $ac_prev | sed 's/_/-/g'` - as_fn_error $? "missing argument to $ac_option" + { $as_echo "$as_me: error: missing argument to $ac_option" >&2 + { (exit 1); exit 1; }; } fi if test -n "$ac_unrecognized_opts"; then case $enable_option_checking in no) ;; - fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; + fatal) { $as_echo "$as_me: error: unrecognized options: $ac_unrecognized_opts" >&2 + { (exit 1); exit 1; }; } ;; *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; esac fi @@ -1072,7 +1105,8 @@ do [\\/$]* | ?:[\\/]* ) continue;; NONE | '' ) case $ac_var in *prefix ) continue;; esac;; esac - as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val" + { $as_echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2 + { (exit 1); exit 1; }; } done # There might be people who depend on the old broken behavior: `$host' @@ -1086,6 +1120,8 @@ target=$target_alias if test "x$host_alias" != x; then if test "x$build_alias" = x; then cross_compiling=maybe + $as_echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host. + If a cross compiler is detected then cross compile mode will be used." 
>&2 elif test "x$build_alias" != "x$host_alias"; then cross_compiling=yes fi @@ -1100,9 +1136,11 @@ test "$silent" = yes && exec 6>/dev/null ac_pwd=`pwd` && test -n "$ac_pwd" && ac_ls_di=`ls -di .` && ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || - as_fn_error $? "working directory cannot be determined" + { $as_echo "$as_me: error: working directory cannot be determined" >&2 + { (exit 1); exit 1; }; } test "X$ac_ls_di" = "X$ac_pwd_ls_di" || - as_fn_error $? "pwd does not report name of working directory" + { $as_echo "$as_me: error: pwd does not report name of working directory" >&2 + { (exit 1); exit 1; }; } # Find the source files, if location was not specified. @@ -1141,11 +1179,13 @@ else fi if test ! -r "$srcdir/$ac_unique_file"; then test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." - as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir" + { $as_echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2 + { (exit 1); exit 1; }; } fi ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" ac_abs_confdir=`( - cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg" + cd "$srcdir" && test -r "./$ac_unique_file" || { $as_echo "$as_me: error: $ac_msg" >&2 + { (exit 1); exit 1; }; } pwd)` # When building in place, set srcdir=. if test "$ac_abs_confdir" = "$ac_pwd"; then @@ -1185,7 +1225,7 @@ Configuration: --help=short display options specific to this package --help=recursive display the short help of all the included packages -V, --version display version information and exit - -q, --quiet, --silent do not print \`checking ...' messages + -q, --quiet, --silent do not print \`checking...' messages --cache-file=FILE cache test results in FILE [disabled] -C, --config-cache alias for \`--cache-file=config.cache' -n, --no-create do not create output files @@ -1242,7 +1282,18 @@ Optional Packages: --with-htslib=PATH specify prefix directory for installed htslib library. -Report bugs to the package provider. 
+Some influential environment variables: + CC C compiler command + CFLAGS C compiler flags + LDFLAGS linker flags, e.g. -L if you have libraries in a + nonstandard directory + LIBS libraries to pass to the linker, e.g. -l + CPPFLAGS C/C++/Objective C preprocessor flags, e.g. -I if + you have headers in a nonstandard directory + +Use these variables to override the choices made by `configure' or to help +it to find libraries and programs with nonstandard names/locations. + _ACEOF ac_status=$? fi @@ -1306,24 +1357,21 @@ test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF gemBS_utils configure 1.0 -generated by GNU Autoconf 2.69 +generated by GNU Autoconf 2.63 -Copyright (C) 2012 Free Software Foundation, Inc. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, +2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. This configure script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it. _ACEOF exit fi - -## ------------------------ ## -## Autoconf initialization. ## -## ------------------------ ## cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. It was created by gemBS_utils $as_me 1.0, which was -generated by GNU Autoconf 2.69. Invocation command line was +generated by GNU Autoconf 2.63. Invocation command line was $ $0 $@ @@ -1359,8 +1407,8 @@ for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. 
- $as_echo "PATH: $as_dir" - done + $as_echo "PATH: $as_dir" +done IFS=$as_save_IFS } >&5 @@ -1397,9 +1445,9 @@ do ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; esac case $ac_pass in - 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; + 1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;; 2) - as_fn_append ac_configure_args1 " '$ac_arg'" + ac_configure_args1="$ac_configure_args1 '$ac_arg'" if test $ac_must_keep_next = true; then ac_must_keep_next=false # Got value, back to normal. else @@ -1415,13 +1463,13 @@ do -* ) ac_must_keep_next=true ;; esac fi - as_fn_append ac_configure_args " '$ac_arg'" + ac_configure_args="$ac_configure_args '$ac_arg'" ;; esac done done -{ ac_configure_args0=; unset ac_configure_args0;} -{ ac_configure_args1=; unset ac_configure_args1;} +$as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; } +$as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; } # When interrupted or exit'd, cleanup temporary files, and complete # config.log. We remove comments because anyway the quotes in there @@ -1433,9 +1481,11 @@ trap 'exit_status=$? { echo - $as_echo "## ---------------- ## + cat <<\_ASBOX +## ---------------- ## ## Cache variables. ## -## ---------------- ##" +## ---------------- ## +_ASBOX echo # The following way of writing the cache mishandles newlines in values, ( @@ -1444,13 +1494,13 @@ trap 'exit_status=$? 
case $ac_val in #( *${as_nl}*) case $ac_var in #( - *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 + *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5 $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; esac case $ac_var in #( _ | IFS | as_nl) ;; #( BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( - *) { eval $ac_var=; unset $ac_var;} ;; + *) $as_unset $ac_var ;; esac ;; esac done @@ -1469,9 +1519,11 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; ) echo - $as_echo "## ----------------- ## + cat <<\_ASBOX +## ----------------- ## ## Output variables. ## -## ----------------- ##" +## ----------------- ## +_ASBOX echo for ac_var in $ac_subst_vars do @@ -1484,9 +1536,11 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; echo if test -n "$ac_subst_files"; then - $as_echo "## ------------------- ## + cat <<\_ASBOX +## ------------------- ## ## File substitutions. ## -## ------------------- ##" +## ------------------- ## +_ASBOX echo for ac_var in $ac_subst_files do @@ -1500,9 +1554,11 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; fi if test -s confdefs.h; then - $as_echo "## ----------- ## + cat <<\_ASBOX +## ----------- ## ## confdefs.h. ## -## ----------- ##" +## ----------- ## +_ASBOX echo cat confdefs.h echo @@ -1516,39 +1572,37 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; exit $exit_status ' 0 for ac_signal in 1 2 13 15; do - trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal + trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal done ac_signal=0 # confdefs.h avoids OS command line length limits that DEFS can exceed. rm -f -r conftest* confdefs.h -$as_echo "/* confdefs.h */" > confdefs.h - # Predefined preprocessor variables. 
cat >>confdefs.h <<_ACEOF #define PACKAGE_NAME "$PACKAGE_NAME" _ACEOF + cat >>confdefs.h <<_ACEOF #define PACKAGE_TARNAME "$PACKAGE_TARNAME" _ACEOF + cat >>confdefs.h <<_ACEOF #define PACKAGE_VERSION "$PACKAGE_VERSION" _ACEOF + cat >>confdefs.h <<_ACEOF #define PACKAGE_STRING "$PACKAGE_STRING" _ACEOF -cat >>confdefs.h <<_ACEOF -#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" -_ACEOF cat >>confdefs.h <<_ACEOF -#define PACKAGE_URL "$PACKAGE_URL" +#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" _ACEOF @@ -1557,12 +1611,7 @@ _ACEOF ac_site_file1=NONE ac_site_file2=NONE if test -n "$CONFIG_SITE"; then - # We do not want a PATH search for config.site. - case $CONFIG_SITE in #(( - -*) ac_site_file1=./$CONFIG_SITE;; - */*) ac_site_file1=$CONFIG_SITE;; - *) ac_site_file1=./$CONFIG_SITE;; - esac + ac_site_file1=$CONFIG_SITE elif test "x$prefix" != xNONE; then ac_site_file1=$prefix/share/config.site ac_site_file2=$prefix/etc/config.site @@ -1573,23 +1622,19 @@ fi for ac_site_file in "$ac_site_file1" "$ac_site_file2" do test "x$ac_site_file" = xNONE && continue - if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 + if test -r "$ac_site_file"; then + { $as_echo "$as_me:$LINENO: loading site script $ac_site_file" >&5 $as_echo "$as_me: loading site script $ac_site_file" >&6;} sed 's/^/| /' "$ac_site_file" >&5 - . "$ac_site_file" \ - || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -as_fn_error $? "failed to load site script $ac_site_file -See \`config.log' for more details" "$LINENO" 5; } + . "$ac_site_file" fi done if test -r "$cache_file"; then - # Some versions of bash will fail to source /dev/null (special files - # actually), so we avoid doing that. DJGPP emulates it as a regular file. 
- if test /dev/null != "$cache_file" && test -f "$cache_file"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 + # Some versions of bash will fail to source /dev/null (special + # files actually), so we avoid doing that. + if test -f "$cache_file"; then + { $as_echo "$as_me:$LINENO: loading cache $cache_file" >&5 $as_echo "$as_me: loading cache $cache_file" >&6;} case $cache_file in [\\/]* | ?:[\\/]* ) . "$cache_file";; @@ -1597,83 +1642,1384 @@ $as_echo "$as_me: loading cache $cache_file" >&6;} esac fi else - { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 -$as_echo "$as_me: creating cache $cache_file" >&6;} - >$cache_file + { $as_echo "$as_me:$LINENO: creating cache $cache_file" >&5 +$as_echo "$as_me: creating cache $cache_file" >&6;} + >$cache_file +fi + +# Check that the precious variables saved in the cache have kept the same +# value. +ac_cache_corrupted=false +for ac_var in $ac_precious_vars; do + eval ac_old_set=\$ac_cv_env_${ac_var}_set + eval ac_new_set=\$ac_env_${ac_var}_set + eval ac_old_val=\$ac_cv_env_${ac_var}_value + eval ac_new_val=\$ac_env_${ac_var}_value + case $ac_old_set,$ac_new_set in + set,) + { $as_echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,set) + { $as_echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,);; + *) + if test "x$ac_old_val" != "x$ac_new_val"; then + # differences in whitespace do not lead to failure. 
+ ac_old_val_w=`echo x $ac_old_val` + ac_new_val_w=`echo x $ac_new_val` + if test "$ac_old_val_w" != "$ac_new_val_w"; then + { $as_echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5 +$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} + ac_cache_corrupted=: + else + { $as_echo "$as_me:$LINENO: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 +$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} + eval $ac_var=\$ac_old_val + fi + { $as_echo "$as_me:$LINENO: former value: \`$ac_old_val'" >&5 +$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} + { $as_echo "$as_me:$LINENO: current value: \`$ac_new_val'" >&5 +$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} + fi;; + esac + # Pass precious variables to config.status. + if test "$ac_new_set" = set; then + case $ac_new_val in + *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; + *) ac_arg=$ac_var=$ac_new_val ;; + esac + case " $ac_configure_args " in + *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. 
+ *) ac_configure_args="$ac_configure_args '$ac_arg'" ;; + esac + fi +done +if $ac_cache_corrupted; then + { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} + { $as_echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5 +$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} + { { $as_echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5 +$as_echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;} + { (exit 1); exit 1; }; } +fi + + + + + + + + + + + + + + + + + + + + + + + + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. +set dummy ${ac_tool_prefix}gcc; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_CC="${ac_tool_prefix}gcc" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:$LINENO: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_CC"; then + ac_ct_CC=$CC + # Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_ac_ct_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_ac_ct_CC="gcc" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +else + CC="$ac_cv_prog_CC" +fi + +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. +set dummy ${ac_tool_prefix}cc; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_CC="${ac_tool_prefix}cc" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:$LINENO: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + + fi +fi +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + ac_prog_rejected=no +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $# != 0; then + # We chose a different compiler from the bogus one. + # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. 
+ shift + ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" + fi +fi +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:$LINENO: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + for ac_prog in cl.exe + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_CC="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:$LINENO: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CC" && break + done +fi +if test -z "$CC"; then + ac_ct_CC=$CC + for ac_prog in cl.exe +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_ac_ct_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. 
+else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_ac_ct_CC="$ac_prog" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CC" && break +done + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +fi + +fi + + +test -z "$CC" && { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { $as_echo "$as_me:$LINENO: error: no acceptable C compiler found in \$PATH +See \`config.log' for more details." >&5 +$as_echo "$as_me: error: no acceptable C compiler found in \$PATH +See \`config.log' for more details." >&2;} + { (exit 1); exit 1; }; }; } + +# Provide some information about the compiler. +$as_echo "$as_me:$LINENO: checking for C compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +{ (ac_try="$ac_compiler --version >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compiler --version >&5") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 + (exit $ac_status); } +{ (ac_try="$ac_compiler -v >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compiler -v >&5") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } +{ (ac_try="$ac_compiler -V >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compiler -V >&5") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } + +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" +# Try to create an executable without -o first, disregard a.out. +# It will help us diagnose broken compilers, and finding out an intuition +# of exeext. +{ $as_echo "$as_me:$LINENO: checking for C compiler default output file name" >&5 +$as_echo_n "checking for C compiler default output file name... 
" >&6; } +ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` + +# The possible output files: +ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" + +ac_rmfiles= +for ac_file in $ac_files +do + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + * ) ac_rmfiles="$ac_rmfiles $ac_file";; + esac +done +rm -f $ac_rmfiles + +if { (ac_try="$ac_link_default" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_link_default") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; then + # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. +# So ignore a value of `no', otherwise this would lead to `EXEEXT = no' +# in a Makefile. We should not override ac_cv_exeext if it was cached, +# so that the user can short-circuit this test for compilers unknown to +# Autoconf. +for ac_file in $ac_files '' +do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) + ;; + [ab].out ) + # We found the default executable, but exeext='' is most + # certainly right. + break;; + *.* ) + if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; + then :; else + ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + fi + # We set ac_cv_exeext here because the later test for it is not + # safe: cross compilers may not add the suffix if given an `-o' + # argument, so we may need to know it at that point already. + # Even if this section looks crufty: it has the advantage of + # actually working. 
+ break;; + * ) + break;; + esac +done +test "$ac_cv_exeext" = no && ac_cv_exeext= + +else + ac_file='' +fi + +{ $as_echo "$as_me:$LINENO: result: $ac_file" >&5 +$as_echo "$ac_file" >&6; } +if test -z "$ac_file"; then + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { $as_echo "$as_me:$LINENO: error: C compiler cannot create executables +See \`config.log' for more details." >&5 +$as_echo "$as_me: error: C compiler cannot create executables +See \`config.log' for more details." >&2;} + { (exit 77); exit 77; }; }; } +fi + +ac_exeext=$ac_cv_exeext + +# Check that the compiler produces executables we can run. If not, either +# the compiler is broken, or we cross compile. +{ $as_echo "$as_me:$LINENO: checking whether the C compiler works" >&5 +$as_echo_n "checking whether the C compiler works... " >&6; } +# FIXME: These cross compiler hacks should be removed for Autoconf 3.0 +# If not cross compiling, check that we can run a simple program. +if test "$cross_compiling" != yes; then + if { ac_try='./$ac_file' + { (case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + cross_compiling=no + else + if test "$cross_compiling" = maybe; then + cross_compiling=yes + else + { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { $as_echo "$as_me:$LINENO: error: cannot run C compiled programs. +If you meant to cross compile, use \`--host'. +See \`config.log' for more details." >&5 +$as_echo "$as_me: error: cannot run C compiled programs. +If you meant to cross compile, use \`--host'. +See \`config.log' for more details." 
>&2;} + { (exit 1); exit 1; }; }; } + fi + fi +fi +{ $as_echo "$as_me:$LINENO: result: yes" >&5 +$as_echo "yes" >&6; } + +rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out +ac_clean_files=$ac_clean_files_save +# Check that the compiler produces executables we can run. If not, either +# the compiler is broken, or we cross compile. +{ $as_echo "$as_me:$LINENO: checking whether we are cross compiling" >&5 +$as_echo_n "checking whether we are cross compiling... " >&6; } +{ $as_echo "$as_me:$LINENO: result: $cross_compiling" >&5 +$as_echo "$cross_compiling" >&6; } + +{ $as_echo "$as_me:$LINENO: checking for suffix of executables" >&5 +$as_echo_n "checking for suffix of executables... " >&6; } +if { (ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; then + # If both `conftest.exe' and `conftest' are `present' (well, observable) +# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will +# work properly (i.e., refer to `conftest.exe'), while it won't with +# `rm'. +for ac_file in conftest.exe conftest conftest.*; do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + break;; + * ) break;; + esac +done +else + { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of executables: cannot compile and link +See \`config.log' for more details." >&5 +$as_echo "$as_me: error: cannot compute suffix of executables: cannot compile and link +See \`config.log' for more details." 
>&2;} + { (exit 1); exit 1; }; }; } +fi + +rm -f conftest$ac_cv_exeext +{ $as_echo "$as_me:$LINENO: result: $ac_cv_exeext" >&5 +$as_echo "$ac_cv_exeext" >&6; } + +rm -f conftest.$ac_ext +EXEEXT=$ac_cv_exeext +ac_exeext=$EXEEXT +{ $as_echo "$as_me:$LINENO: checking for suffix of object files" >&5 +$as_echo_n "checking for suffix of object files... " >&6; } +if test "${ac_cv_objext+set}" = set; then + $as_echo_n "(cached) " >&6 +else + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.o conftest.obj +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; then + for ac_file in conftest.o conftest.obj conftest.*; do + test -f "$ac_file" || continue; + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;; + *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` + break;; + esac +done +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of object files: cannot compile +See \`config.log' for more details." >&5 +$as_echo "$as_me: error: cannot compute suffix of object files: cannot compile +See \`config.log' for more details." 
>&2;} + { (exit 1); exit 1; }; }; } +fi + +rm -f conftest.$ac_cv_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_objext" >&5 +$as_echo "$ac_cv_objext" >&6; } +OBJEXT=$ac_cv_objext +ac_objext=$OBJEXT +{ $as_echo "$as_me:$LINENO: checking whether we are using the GNU C compiler" >&5 +$as_echo_n "checking whether we are using the GNU C compiler... " >&6; } +if test "${ac_cv_c_compiler_gnu+set}" = set; then + $as_echo_n "(cached) " >&6 +else + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_compiler_gnu=yes +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_compiler_gnu=no +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_c_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_c_compiler_gnu" >&5 +$as_echo "$ac_cv_c_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GCC=yes +else + GCC= +fi +ac_test_CFLAGS=${CFLAGS+set} +ac_save_CFLAGS=$CFLAGS +{ $as_echo "$as_me:$LINENO: checking whether $CC accepts -g" >&5 +$as_echo_n "checking whether $CC accepts -g... 
" >&6; } +if test "${ac_cv_prog_cc_g+set}" = set; then + $as_echo_n "(cached) " >&6 +else + ac_save_c_werror_flag=$ac_c_werror_flag + ac_c_werror_flag=yes + ac_cv_prog_cc_g=no + CFLAGS="-g" + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_prog_cc_g=yes +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + CFLAGS="" + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest.$ac_objext; then + : +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_c_werror_flag=$ac_save_c_werror_flag + CFLAGS="-g" + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_prog_cc_g=yes +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_c_werror_flag=$ac_save_c_werror_flag +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_g" >&5 +$as_echo "$ac_cv_prog_cc_g" >&6; } +if test "$ac_test_CFLAGS" = set; then + CFLAGS=$ac_save_CFLAGS +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi +{ $as_echo "$as_me:$LINENO: checking for $CC option to accept ISO C89" >&5 +$as_echo_n "checking for $CC option to accept ISO C89... " >&6; } +if test "${ac_cv_prog_cc_c89+set}" = set; then + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c89=no +ac_save_CC=$CC +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. 
*/ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +#include +#include +#include +#include +/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */ +struct buf { int x; }; +FILE * (*rcsopen) (struct buf *, struct stat *, int); +static char *e (p, i) + char **p; + int i; +{ + return p[i]; +} +static char *f (char * (*g) (char **, int), char **p, ...) +{ + char *s; + va_list v; + va_start (v,p); + s = g (p, va_arg (v,int)); + va_end (v); + return s; +} + +/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has + function prototypes and stuff, but not '\xHH' hex character constants. + These don't provoke an error unfortunately, instead are silently treated + as 'x'. The following induces an error, until -std is added to get + proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an + array size at least. It's necessary to write '\x00'==0 to get something + that's true only with -std. */ +int osf4_cc_array ['\x00' == 0 ? 1 : -1]; + +/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters + inside strings and character constants. */ +#define FOO(x) 'x' +int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1]; + +int test (int i, double x); +struct s1 {int (*f) (int a);}; +struct s2 {int (*f) (double a);}; +int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); +int argc; +char **argv; +int +main () +{ +return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; + ; + return 0; +} +_ACEOF +for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ + -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" +do + CC="$ac_save_CC $ac_arg" + rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? 
+ grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_prog_cc_c89=$ac_arg +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c89" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c89" in + x) + { $as_echo "$as_me:$LINENO: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:$LINENO: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c89" + { $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c89" >&5 +$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; +esac + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + case $ac_cv_prog_cc_stdc in + no) ac_cv_prog_cc_c99=no; ac_cv_prog_cc_c89=no ;; + *) { $as_echo "$as_me:$LINENO: checking for $CC option to accept ISO C99" >&5 +$as_echo_n "checking for $CC option to accept ISO C99... " >&6; } +if test "${ac_cv_prog_cc_c99+set}" = set; then + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c99=no +ac_save_CC=$CC +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +#include +#include +#include +#include +#include + +// Check varargs macros. These examples are taken from C99 6.10.3.5. +#define debug(...) fprintf (stderr, __VA_ARGS__) +#define showlist(...) puts (#__VA_ARGS__) +#define report(test,...) ((test) ? 
puts (#test) : printf (__VA_ARGS__)) +static void +test_varargs_macros (void) +{ + int x = 1234; + int y = 5678; + debug ("Flag"); + debug ("X = %d\n", x); + showlist (The first, second, and third items.); + report (x>y, "x is %d but y is %d", x, y); +} + +// Check long long types. +#define BIG64 18446744073709551615ull +#define BIG32 4294967295ul +#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0) +#if !BIG_OK + your preprocessor is broken; +#endif +#if BIG_OK +#else + your preprocessor is broken; +#endif +static long long int bignum = -9223372036854775807LL; +static unsigned long long int ubignum = BIG64; + +struct incomplete_array +{ + int datasize; + double data[]; +}; + +struct named_init { + int number; + const wchar_t *name; + double average; +}; + +typedef const char *ccp; + +static inline int +test_restrict (ccp restrict text) +{ + // See if C++-style comments work. + // Iterate through items via the restricted pointer. + // Also check for declarations in for loops. + for (unsigned int i = 0; *(text+i) != '\0'; ++i) + continue; + return 0; +} + +// Check varargs and va_copy. +static void +test_varargs (const char *format, ...) +{ + va_list args; + va_start (args, format); + va_list args_copy; + va_copy (args_copy, args); + + const char *str; + int number; + float fnumber; + + while (*format) + { + switch (*format++) + { + case 's': // string + str = va_arg (args_copy, const char *); + break; + case 'd': // int + number = va_arg (args_copy, int); + break; + case 'f': // float + fnumber = va_arg (args_copy, double); + break; + default: + break; + } + } + va_end (args_copy); + va_end (args); +} + +int +main () +{ + + // Check bool. + _Bool success = false; + + // Check restrict. + if (test_restrict ("String literal") == 0) + success = true; + char *restrict newvar = "Another string"; + + // Check varargs. + test_varargs ("s, d' f .", "string", 65, 34.234); + test_varargs_macros (); + + // Check flexible array members. 
+ struct incomplete_array *ia = + malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10)); + ia->datasize = 10; + for (int i = 0; i < ia->datasize; ++i) + ia->data[i] = i * 1.234; + + // Check named initializers. + struct named_init ni = { + .number = 34, + .name = L"Test wide string", + .average = 543.34343, + }; + + ni.number = 58; + + int dynamic_array[ni.number]; + dynamic_array[ni.number - 1] = 543; + + // work around unused variable warnings + return (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == 'x' + || dynamic_array[ni.number - 1] != 543); + + ; + return 0; +} +_ACEOF +for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -xc99=all -qlanglvl=extc99 +do + CC="$ac_save_CC $ac_arg" + rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_prog_cc_c99=$ac_arg +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c99" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c99" in + x) + { $as_echo "$as_me:$LINENO: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:$LINENO: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c99" + { $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c99" >&5 +$as_echo "$ac_cv_prog_cc_c99" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c99" != xno; then + ac_cv_prog_cc_stdc=$ac_cv_prog_cc_c99 +else + { $as_echo "$as_me:$LINENO: checking for $CC option to accept ISO C89" >&5 +$as_echo_n "checking for $CC option to accept ISO C89... " >&6; } +if test "${ac_cv_prog_cc_c89+set}" = set; then + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c89=no +ac_save_CC=$CC +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +#include +#include +#include +#include +/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */ +struct buf { int x; }; +FILE * (*rcsopen) (struct buf *, struct stat *, int); +static char *e (p, i) + char **p; + int i; +{ + return p[i]; +} +static char *f (char * (*g) (char **, int), char **p, ...) +{ + char *s; + va_list v; + va_start (v,p); + s = g (p, va_arg (v,int)); + va_end (v); + return s; +} + +/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has + function prototypes and stuff, but not '\xHH' hex character constants. + These don't provoke an error unfortunately, instead are silently treated + as 'x'. The following induces an error, until -std is added to get + proper ANSI mode. 
Curiously '\x00'!='x' always comes out true, for an + array size at least. It's necessary to write '\x00'==0 to get something + that's true only with -std. */ +int osf4_cc_array ['\x00' == 0 ? 1 : -1]; + +/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters + inside strings and character constants. */ +#define FOO(x) 'x' +int xlc6_cc_array[FOO(a) == 'x' ? 1 : -1]; + +int test (int i, double x); +struct s1 {int (*f) (int a);}; +struct s2 {int (*f) (double a);}; +int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); +int argc; +char **argv; +int +main () +{ +return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; + ; + return 0; +} +_ACEOF +for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ + -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" +do + CC="$ac_save_CC $ac_arg" + rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_prog_cc_c89=$ac_arg +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c89" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c89" in + x) + { $as_echo "$as_me:$LINENO: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:$LINENO: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c89" + { $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c89" >&5 +$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c89" != xno; then + ac_cv_prog_cc_stdc=$ac_cv_prog_cc_c89 +else + ac_cv_prog_cc_stdc=no fi -# Check that the precious variables saved in the cache have kept the same -# value. -ac_cache_corrupted=false -for ac_var in $ac_precious_vars; do - eval ac_old_set=\$ac_cv_env_${ac_var}_set - eval ac_new_set=\$ac_env_${ac_var}_set - eval ac_old_val=\$ac_cv_env_${ac_var}_value - eval ac_new_val=\$ac_env_${ac_var}_value - case $ac_old_set,$ac_new_set in - set,) - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 -$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} - ac_cache_corrupted=: ;; - ,set) - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 -$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} - ac_cache_corrupted=: ;; - ,);; - *) - if test "x$ac_old_val" != "x$ac_new_val"; then - # differences in whitespace do not lead to failure. 
- ac_old_val_w=`echo x $ac_old_val` - ac_new_val_w=`echo x $ac_new_val` - if test "$ac_old_val_w" != "$ac_new_val_w"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 -$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} - ac_cache_corrupted=: - else - { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 -$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} - eval $ac_var=\$ac_old_val - fi - { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 -$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 -$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} - fi;; - esac - # Pass precious variables to config.status. - if test "$ac_new_set" = set; then - case $ac_new_val in - *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; - *) ac_arg=$ac_var=$ac_new_val ;; - esac - case " $ac_configure_args " in - *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. - *) as_fn_append ac_configure_args " '$ac_arg'" ;; - esac - fi -done -if $ac_cache_corrupted; then - { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} - { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 -$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} - as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 + fi -## -------------------- ## -## Main body of script. 
## -## -------------------- ## -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu + ;; +esac + { $as_echo "$as_me:$LINENO: checking for $CC option to accept ISO Standard C" >&5 +$as_echo_n "checking for $CC option to accept ISO Standard C... " >&6; } + if test "${ac_cv_prog_cc_stdc+set}" = set; then + $as_echo_n "(cached) " >&6 +fi + case $ac_cv_prog_cc_stdc in + no) { $as_echo "$as_me:$LINENO: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + '') { $as_echo "$as_me:$LINENO: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + *) { $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_stdc" >&5 +$as_echo "$ac_cv_prog_cc_stdc" >&6; } ;; +esac # Check whether --with-htslib was given. -if test "${with_htslib+set}" = set; then : +if test "${with_htslib+set}" = set; then withval=$with_htslib; fi @@ -1717,13 +3063,13 @@ _ACEOF case $ac_val in #( *${as_nl}*) case $ac_var in #( - *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 + *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5 $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; esac case $ac_var in #( _ | IFS | as_nl) ;; #( BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( - *) { eval $ac_var=; unset $ac_var;} ;; + *) $as_unset $ac_var ;; esac ;; esac done @@ -1731,8 +3077,8 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; (set) 2>&1 | case $as_nl`(ac_space=' '; set) 2>&1` in #( *${as_nl}ac_space=\ *) - # `set' does not quote correctly, so add quotes: double-quote - # substitution turns \\\\ into \\, and sed turns \\ into \. + # `set' does not quote correctly, so add quotes (double-quote + # substitution turns \\\\ into \\, and sed turns \\ into \). 
sed -n \ "s/'/'\\\\''/g; s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" @@ -1754,23 +3100,12 @@ $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; :end' >>confcache if diff "$cache_file" confcache >/dev/null 2>&1; then :; else if test -w "$cache_file"; then - if test "x$cache_file" != "x/dev/null"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 + test "x$cache_file" != "x/dev/null" && + { $as_echo "$as_me:$LINENO: updating cache $cache_file" >&5 $as_echo "$as_me: updating cache $cache_file" >&6;} - if test ! -f "$cache_file" || test -h "$cache_file"; then - cat confcache >"$cache_file" - else - case $cache_file in #( - */* | ?:*) - mv -f confcache "$cache_file"$$ && - mv -f "$cache_file"$$ "$cache_file" ;; #( - *) - mv -f confcache "$cache_file" ;; - esac - fi - fi + cat confcache >$cache_file else - { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 + { $as_echo "$as_me:$LINENO: not updating unwritable cache $cache_file" >&5 $as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} fi fi @@ -1820,15 +3155,14 @@ DEFS=`sed -n "$ac_script" confdefs.h` ac_libobjs= ac_ltlibobjs= -U= for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue # 1. Remove the extension, and $U if already installed. ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' ac_i=`$as_echo "$ac_i" | sed "$ac_script"` # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR # will be set to the directory where LIBOBJS objects are built. 
- as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" - as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo' + ac_libobjs="$ac_libobjs \${LIBOBJDIR}$ac_i\$U.$ac_objext" + ac_ltlibobjs="$ac_ltlibobjs \${LIBOBJDIR}$ac_i"'$U.lo' done LIBOBJS=$ac_libobjs @@ -1836,14 +3170,13 @@ LTLIBOBJS=$ac_ltlibobjs -: "${CONFIG_STATUS=./config.status}" +: ${CONFIG_STATUS=./config.status} ac_write_fail=0 ac_clean_files_save=$ac_clean_files ac_clean_files="$ac_clean_files $CONFIG_STATUS" -{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 +{ $as_echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5 $as_echo "$as_me: creating $CONFIG_STATUS" >&6;} -as_write_fail=0 -cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 +cat >$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 #! $SHELL # Generated by $as_me. # Run this file to recreate the current configuration. @@ -1853,18 +3186,17 @@ cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 debug=false ac_cs_recheck=false ac_cs_silent=false - SHELL=\${CONFIG_SHELL-$SHELL} -export SHELL -_ASEOF -cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 -## -------------------- ## -## M4sh Initialization. ## -## -------------------- ## +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +## --------------------- ## +## M4sh Initialization. ## +## --------------------- ## # Be more Bourne compatible DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then emulate sh NULLCMD=: # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which @@ -1872,15 +3204,23 @@ if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : alias -g '${1+"$@"}'='"$@"' setopt NO_GLOB_SUBST else - case `(set -o) 2>/dev/null` in #( - *posix*) : - set -o posix ;; #( - *) : - ;; + case `(set -o) 2>/dev/null` in + *posix*) set -o posix ;; esac + fi + + +# PATH needs CR +# Avoid depending upon Character Ranges. 
+as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + as_nl=' ' export as_nl @@ -1888,13 +3228,7 @@ export as_nl as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo -# Prefer a ksh shell builtin over an external printf program on Solaris, -# but without wasting forks for bash or zsh. -if test -z "$BASH_VERSION$ZSH_VERSION" \ - && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='print -r --' - as_echo_n='print -rn --' -elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then +if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then as_echo='printf %s\n' as_echo_n='printf %s' else @@ -1905,7 +3239,7 @@ else as_echo_body='eval expr "X$1" : "X\\(.*\\)"' as_echo_n_body='eval arg=$1; - case $arg in #( + case $arg in *"$as_nl"*) expr "X$arg" : "X\\(.*\\)$as_nl"; arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; @@ -1928,6 +3262,13 @@ if test "${PATH_SEPARATOR+set}" != set; then } fi +# Support unset when possible. +if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then + as_unset=unset +else + as_unset=false +fi + # IFS # We need space, tab and new line, in precisely that order. Quoting is @@ -1937,16 +3278,15 @@ fi IFS=" "" $as_nl" # Find who we are. Look in the path if we contain no directory separator. -as_myself= -case $0 in #(( +case $0 in *[\\/]* ) as_myself=$0 ;; *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR for as_dir in $PATH do IFS=$as_save_IFS test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break - done + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break +done IFS=$as_save_IFS ;; @@ -1958,16 +3298,12 @@ if test "x$as_myself" = x; then fi if test ! 
-f "$as_myself"; then $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 - exit 1 + { (exit 1); exit 1; } fi -# Unset variables that we do not need and which cause bugs (e.g. in -# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" -# suppresses any "Segmentation fault" message there. '((' could -# trigger a bug in pdksh 5.2.14. -for as_var in BASH_ENV ENV MAIL MAILPATH -do eval test x\${$as_var+set} = xset \ - && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +# Work around bugs in pre-3.0 UWIN ksh. +for as_var in ENV MAIL MAILPATH +do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var done PS1='$ ' PS2='> ' @@ -1979,89 +3315,7 @@ export LC_ALL LANGUAGE=C export LANGUAGE -# CDPATH. -(unset CDPATH) >/dev/null 2>&1 && unset CDPATH - - -# as_fn_error STATUS ERROR [LINENO LOG_FD] -# ---------------------------------------- -# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are -# provided, also output the error to LOG_FD, referencing LINENO. Then exit the -# script with STATUS, using 1 if that was 0. -as_fn_error () -{ - as_status=$1; test $as_status -eq 0 && as_status=1 - if test "$4"; then - as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack - $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 - fi - $as_echo "$as_me: error: $2" >&2 - as_fn_exit $as_status -} # as_fn_error - - -# as_fn_set_status STATUS -# ----------------------- -# Set $? to STATUS, without forking. -as_fn_set_status () -{ - return $1 -} # as_fn_set_status - -# as_fn_exit STATUS -# ----------------- -# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. -as_fn_exit () -{ - set +e - as_fn_set_status $1 - exit $1 -} # as_fn_exit - -# as_fn_unset VAR -# --------------- -# Portably unset VAR. 
-as_fn_unset () -{ - { eval $1=; unset $1;} -} -as_unset=as_fn_unset -# as_fn_append VAR VALUE -# ---------------------- -# Append the text in VALUE to the end of the definition contained in VAR. Take -# advantage of any shell optimizations that allow amortized linear growth over -# repeated appends, instead of the typical quadratic growth present in naive -# implementations. -if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : - eval 'as_fn_append () - { - eval $1+=\$2 - }' -else - as_fn_append () - { - eval $1=\$$1\$2 - } -fi # as_fn_append - -# as_fn_arith ARG... -# ------------------ -# Perform arithmetic evaluation on the ARGs, and store the result in the -# global $as_val. Take advantage of shells that can avoid forks. The arguments -# must be portable across $(()) and expr. -if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : - eval 'as_fn_arith () - { - as_val=$(( $* )) - }' -else - as_fn_arith () - { - as_val=`expr "$@" || test $? -eq 1` - } -fi # as_fn_arith - - +# Required to use basename. if expr a : '\(a\)' >/dev/null 2>&1 && test "X`expr 00001 : '.*\(...\)'`" = X001; then as_expr=expr @@ -2075,12 +3329,8 @@ else as_basename=false fi -if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then - as_dirname=dirname -else - as_dirname=false -fi +# Name of the executable. as_me=`$as_basename -- "$0" || $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ X"$0" : 'X\(//\)$' \| \ @@ -2100,25 +3350,76 @@ $as_echo X/"$0" | } s/.*/./; q'` -# Avoid depending upon Character Ranges. -as_cr_letters='abcdefghijklmnopqrstuvwxyz' -as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' -as_cr_Letters=$as_cr_letters$as_cr_LETTERS -as_cr_digits='0123456789' -as_cr_alnum=$as_cr_Letters$as_cr_digits +# CDPATH. 
+$as_unset CDPATH + + + + as_lineno_1=$LINENO + as_lineno_2=$LINENO + test "x$as_lineno_1" != "x$as_lineno_2" && + test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || { + + # Create $as_me.lineno as a copy of $as_myself, but with $LINENO + # uniformly replaced by the line number. The first 'sed' inserts a + # line-number line after each line using $LINENO; the second 'sed' + # does the real work. The second script uses 'N' to pair each + # line-number line with the line containing $LINENO, and appends + # trailing '-' during substitution so that $LINENO is not a special + # case at line end. + # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the + # scripts with optimization help from Paolo Bonzini. Blame Lee + # E. McMahon (1931-1989) for sed's syntax. :-) + sed -n ' + p + /[$]LINENO/= + ' <$as_myself | + sed ' + s/[$]LINENO.*/&-/ + t lineno + b + :lineno + N + :loop + s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ + t loop + s/-\n.*// + ' >$as_me.lineno && + chmod +x "$as_me.lineno" || + { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2 + { (exit 1); exit 1; }; } + + # Don't try to exec as it changes $[0], causing all sort of problems + # (the dirname of $[0] is not the place where we might find the + # original and so on. Autoconf is especially sensitive to this). + . "./$as_me.lineno" + # Exit status is that of the last command. + exit +} + + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi ECHO_C= ECHO_N= ECHO_T= -case `echo -n x` in #((((( +case `echo -n x` in -n*) - case `echo 'xy\c'` in + case `echo 'x\c'` in *c*) ECHO_T=' ';; # ECHO_T is single tab character. 
- xy) ECHO_C='\c';; - *) echo `echo ksh88 bug on AIX 6.1` > /dev/null - ECHO_T=' ';; + *) ECHO_C='\c';; esac;; *) ECHO_N='-n';; esac +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi rm -f conf$$ conf$$.exe conf$$.file if test -d conf$$.dir; then @@ -2133,85 +3434,49 @@ if (echo >conf$$.file) 2>/dev/null; then # ... but there are two gotchas: # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. - # In both cases, we have to default to `cp -pR'. + # In both cases, we have to default to `cp -p'. ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || - as_ln_s='cp -pR' + as_ln_s='cp -p' elif ln conf$$.file conf$$ 2>/dev/null; then as_ln_s=ln else - as_ln_s='cp -pR' + as_ln_s='cp -p' fi else - as_ln_s='cp -pR' + as_ln_s='cp -p' fi rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file rmdir conf$$.dir 2>/dev/null - -# as_fn_mkdir_p -# ------------- -# Create "$as_dir" as a directory, including parents if necessary. -as_fn_mkdir_p () -{ - - case $as_dir in #( - -*) as_dir=./$as_dir;; - esac - test -d "$as_dir" || eval $as_mkdir_p || { - as_dirs= - while :; do - case $as_dir in #( - *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( - *) as_qdir=$as_dir;; - esac - as_dirs="'$as_qdir' $as_dirs" - as_dir=`$as_dirname -- "$as_dir" || -$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$as_dir" : 'X\(//\)[^/]' \| \ - X"$as_dir" : 'X\(//\)$' \| \ - X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_dir" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - test -d "$as_dir" && break - done - test -z "$as_dirs" || eval "mkdir $as_dirs" - } || test -d "$as_dir" || as_fn_error $? 
"cannot create directory $as_dir" - - -} # as_fn_mkdir_p if mkdir -p . 2>/dev/null; then - as_mkdir_p='mkdir -p "$as_dir"' + as_mkdir_p=: else test -d ./-p && rmdir ./-p as_mkdir_p=false fi - -# as_fn_executable_p FILE -# ----------------------- -# Test if FILE is an executable regular file. -as_fn_executable_p () -{ - test -f "$1" && test -x "$1" -} # as_fn_executable_p -as_test_x='test -x' -as_executable_p=as_fn_executable_p +if test -x / >/dev/null 2>&1; then + as_test_x='test -x' +else + if ls -dL / >/dev/null 2>&1; then + as_ls_L_option=L + else + as_ls_L_option= + fi + as_test_x=' + eval sh -c '\'' + if test -d "$1"; then + test -d "$1/."; + else + case $1 in + -*)set "./$1";; + esac; + case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in + ???[sx]*):;;*)false;;esac;fi + '\'' sh + ' +fi +as_executable_p=$as_test_x # Sed expression to map a string onto a valid CPP name. as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" @@ -2221,19 +3486,13 @@ as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" exec 6>&1 -## ----------------------------------- ## -## Main body of $CONFIG_STATUS script. ## -## ----------------------------------- ## -_ASEOF -test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1 -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -# Save the log message, to keep $0 and so on meaningful, and to +# Save the log message, to keep $[0] and so on meaningful, and to # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" This file was extended by gemBS_utils $as_me 1.0, which was -generated by GNU Autoconf 2.69. Invocation command line was +generated by GNU Autoconf 2.63. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES CONFIG_HEADERS = $CONFIG_HEADERS @@ -2260,15 +3519,13 @@ _ACEOF cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 ac_cs_usage="\ -\`$as_me' instantiates files and other configuration actions -from templates according to the current configuration. Unless the files -and actions are specified as TAGs, all are instantiated by default. +\`$as_me' instantiates files from templates according to the +current configuration. -Usage: $0 [OPTION]... [TAG]... +Usage: $0 [OPTION]... [FILE]... -h, --help print this help, then exit -V, --version print version number and configuration settings, then exit - --config print configuration, then exit -q, --quiet, --silent do not print progress messages -d, --debug don't remove temporary files @@ -2279,17 +3536,16 @@ Usage: $0 [OPTION]... [TAG]... Configuration files: $config_files -Report bugs to the package provider." +Report bugs to ." _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ gemBS_utils config.status 1.0 -configured by $0, generated by GNU Autoconf 2.69, - with options \\"\$ac_cs_config\\" +configured by $0, generated by GNU Autoconf 2.63, + with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" -Copyright (C) 2012 Free Software Foundation, Inc. +Copyright (C) 2008 Free Software Foundation, Inc. This config.status script is free software; the Free Software Foundation gives unlimited permission to copy, distribute and modify it." 
@@ -2304,16 +3560,11 @@ ac_need_defaults=: while test $# != 0 do case $1 in - --*=?*) + --*=*) ac_option=`expr "X$1" : 'X\([^=]*\)='` ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` ac_shift=: ;; - --*=) - ac_option=`expr "X$1" : 'X\([^=]*\)='` - ac_optarg= - ac_shift=: - ;; *) ac_option=$1 ac_optarg=$2 @@ -2327,17 +3578,14 @@ do ac_cs_recheck=: ;; --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) $as_echo "$ac_cs_version"; exit ;; - --config | --confi | --conf | --con | --co | --c ) - $as_echo "$ac_cs_config"; exit ;; --debug | --debu | --deb | --de | --d | -d ) debug=: ;; --file | --fil | --fi | --f ) $ac_shift case $ac_optarg in *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; - '') as_fn_error $? "missing file argument" ;; esac - as_fn_append CONFIG_FILES " '$ac_optarg'" + CONFIG_FILES="$CONFIG_FILES '$ac_optarg'" ac_need_defaults=false;; --he | --h | --help | --hel | -h ) $as_echo "$ac_cs_usage"; exit ;; @@ -2346,10 +3594,11 @@ do ac_cs_silent=: ;; # This is an error. - -*) as_fn_error $? "unrecognized option: \`$1' -Try \`$0 --help' for more information." ;; + -*) { $as_echo "$as_me: error: unrecognized option: $1 +Try \`$0 --help' for more information." >&2 + { (exit 1); exit 1; }; } ;; - *) as_fn_append ac_config_targets " $1" + *) ac_config_targets="$ac_config_targets $1" ac_need_defaults=false ;; esac @@ -2366,7 +3615,7 @@ fi _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 if \$ac_cs_recheck; then - set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion + set X '$SHELL' '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion shift \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 CONFIG_SHELL='$SHELL' @@ -2397,7 +3646,9 @@ do case $ac_config_target in "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; - *) as_fn_error $? 
"invalid argument: \`$ac_config_target'" "$LINENO" 5;; + *) { { $as_echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5 +$as_echo "$as_me: error: invalid argument: $ac_config_target" >&2;} + { (exit 1); exit 1; }; };; esac done @@ -2418,24 +3669,26 @@ fi # after its creation but before its name has been assigned to `$tmp'. $debug || { - tmp= ac_tmp= + tmp= trap 'exit_status=$? - : "${ac_tmp:=$tmp}" - { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status + { test -z "$tmp" || test ! -d "$tmp" || rm -fr "$tmp"; } && exit $exit_status ' 0 - trap 'as_fn_exit 1' 1 2 13 15 + trap '{ (exit 1); exit 1; }' 1 2 13 15 } # Create a (secure) tmp directory for tmp files. { tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && - test -d "$tmp" + test -n "$tmp" && test -d "$tmp" } || { tmp=./conf$$-$RANDOM (umask 077 && mkdir "$tmp") -} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5 -ac_tmp=$tmp +} || +{ + $as_echo "$as_me: cannot create a temporary directory in ." >&2 + { (exit 1); exit 1; } +} # Set up the scripts for CONFIG_FILES section. # No need to generate them if there are no CONFIG_FILES. @@ -2443,13 +3696,7 @@ ac_tmp=$tmp if test -n "$CONFIG_FILES"; then -ac_cr=`echo X | tr X '\015'` -# On cygwin, bash can eat \r inside `` if the user requested igncr. -# But we know of no other shell where ac_cr would be empty at this -# point, so we can use a bashism as a fallback. -if test "x$ac_cr" = x; then - eval ac_cr=\$\'\\r\' -fi +ac_cr=' ' ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null` if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then ac_cs_awk_cr='\\r' @@ -2457,7 +3704,7 @@ else ac_cs_awk_cr=$ac_cr fi -echo 'BEGIN {' >"$ac_tmp/subs1.awk" && +echo 'BEGIN {' >"$tmp/subs1.awk" && _ACEOF @@ -2466,18 +3713,24 @@ _ACEOF echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && echo "_ACEOF" } >conf$$subs.sh || - as_fn_error $? 
"could not make $CONFIG_STATUS" "$LINENO" 5 -ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'` + { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 +$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;} + { (exit 1); exit 1; }; } +ac_delim_num=`echo "$ac_subst_vars" | grep -c '$'` ac_delim='%!_!# ' for ac_last_try in false false false false false :; do . ./conf$$subs.sh || - as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 +$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;} + { (exit 1); exit 1; }; } ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` if test $ac_delim_n = $ac_delim_num; then break elif $ac_last_try; then - as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 +$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;} + { (exit 1); exit 1; }; } else ac_delim="$ac_delim!$ac_delim _$ac_delim!! " fi @@ -2485,7 +3738,7 @@ done rm -f conf$$subs.sh cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK && +cat >>"\$tmp/subs1.awk" <<\\_ACAWK && _ACEOF sed -n ' h @@ -2499,7 +3752,7 @@ s/'"$ac_delim"'$// t delim :nl h -s/\(.\{148\}\)..*/\1/ +s/\(.\{148\}\).*/\1/ t more1 s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ p @@ -2513,7 +3766,7 @@ s/.\{148\}// t nl :delim h -s/\(.\{148\}\)..*/\1/ +s/\(.\{148\}\).*/\1/ t more2 s/["\\]/\\&/g; s/^/"/; s/$/"/ p @@ -2533,7 +3786,7 @@ t delim rm -f conf$$subs.awk cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 _ACAWK -cat >>"\$ac_tmp/subs1.awk" <<_ACAWK && +cat >>"\$tmp/subs1.awk" <<_ACAWK && for (key in S) S_is_set[key] = 1 FS = "" @@ -2565,29 +3818,23 @@ if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" else cat -fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \ - || as_fn_error $? 
"could not setup config files machinery" "$LINENO" 5 +fi < "$tmp/subs1.awk" > "$tmp/subs.awk" \ + || { { $as_echo "$as_me:$LINENO: error: could not setup config files machinery" >&5 +$as_echo "$as_me: error: could not setup config files machinery" >&2;} + { (exit 1); exit 1; }; } _ACEOF -# VPATH may cause trouble with some makes, so we remove sole $(srcdir), -# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and +# VPATH may cause trouble with some makes, so we remove $(srcdir), +# ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and # trailing colons and then remove the whole line if VPATH becomes empty # (actually we leave an empty line to preserve line numbers). if test "x$srcdir" = x.; then - ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{ -h -s/// -s/^/:/ -s/[ ]*$/:/ -s/:\$(srcdir):/:/g -s/:\${srcdir}:/:/g -s/:@srcdir@:/:/g -s/^:*// + ac_vpsub='/^[ ]*VPATH[ ]*=/{ +s/:*\$(srcdir):*/:/ +s/:*\${srcdir}:*/:/ +s/:*@srcdir@:*/:/ +s/^\([^=]*=[ ]*\):*/\1/ s/:*$// -x -s/\(=[ ]*\).*/\1/ -G -s/\n// s/^[^=]*=[ ]*$// }' fi @@ -2605,7 +3852,9 @@ do esac case $ac_mode$ac_tag in :[FHL]*:*);; - :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;; + :L* | :C*:*) { { $as_echo "$as_me:$LINENO: error: invalid tag $ac_tag" >&5 +$as_echo "$as_me: error: invalid tag $ac_tag" >&2;} + { (exit 1); exit 1; }; };; :[FH]-) ac_tag=-:-;; :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; esac @@ -2624,7 +3873,7 @@ do for ac_f do case $ac_f in - -) ac_f="$ac_tmp/stdin";; + -) ac_f="$tmp/stdin";; *) # Look for the file first in the build tree, then in the source tree # (if the path is not absolute). The absolute path cannot be DOS-style, # because $ac_f cannot contain `:'. 
@@ -2633,10 +3882,12 @@ do [\\/$]*) false;; *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; esac || - as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; + { { $as_echo "$as_me:$LINENO: error: cannot find input file: $ac_f" >&5 +$as_echo "$as_me: error: cannot find input file: $ac_f" >&2;} + { (exit 1); exit 1; }; };; esac case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac - as_fn_append ac_file_inputs " '$ac_f'" + ac_file_inputs="$ac_file_inputs '$ac_f'" done # Let's still pretend it is `configure' which instantiates (i.e., don't @@ -2647,7 +3898,7 @@ do `' by configure.' if test x"$ac_file" != x-; then configure_input="$ac_file. $configure_input" - { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 + { $as_echo "$as_me:$LINENO: creating $ac_file" >&5 $as_echo "$as_me: creating $ac_file" >&6;} fi # Neutralize special characters interpreted by sed in replacement strings. @@ -2659,8 +3910,10 @@ $as_echo "$as_me: creating $ac_file" >&6;} esac case $ac_tag in - *:-:* | *:-) cat >"$ac_tmp/stdin" \ - || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; + *:-:* | *:-) cat >"$tmp/stdin" \ + || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5 +$as_echo "$as_me: error: could not create $ac_file" >&2;} + { (exit 1); exit 1; }; } ;; esac ;; esac @@ -2688,7 +3941,47 @@ $as_echo X"$ac_file" | q } s/.*/./; q'` - as_dir="$ac_dir"; as_fn_mkdir_p + { as_dir="$ac_dir" + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || { $as_mkdir_p && mkdir -p "$as_dir"; } || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 
2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || { { $as_echo "$as_me:$LINENO: error: cannot create directory $as_dir" >&5 +$as_echo "$as_me: error: cannot create directory $as_dir" >&2;} + { (exit 1); exit 1; }; }; } ac_builddir=. case "$ac_dir" in @@ -2736,6 +4029,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # If the template does not know about datarootdir, expand it. # FIXME: This hack should be removed a few years after 2.60. ac_datarootdir_hack=; ac_datarootdir_seen= + ac_sed_dataroot=' /datarootdir/ { p @@ -2745,11 +4039,12 @@ ac_sed_dataroot=' /@docdir@/p /@infodir@/p /@localedir@/p -/@mandir@/p' +/@mandir@/p +' case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in *datarootdir*) ac_datarootdir_seen=yes;; *@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 + { $as_echo "$as_me:$LINENO: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 $as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 @@ -2759,7 +4054,7 @@ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 s&@infodir@&$infodir&g s&@localedir@&$localedir&g s&@mandir@&$mandir&g - s&\\\${datarootdir}&$datarootdir&g' ;; + s&\\\${datarootdir}&$datarootdir&g' ;; esac _ACEOF @@ -2785,24 +4080,27 @@ s&@abs_builddir@&$ac_abs_builddir&;t t s&@abs_top_builddir@&$ac_abs_top_builddir&;t t $ac_datarootdir_hack " -eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \ - >$ac_tmp/out || as_fn_error $? 
"could not create $ac_file" "$LINENO" 5 +eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$tmp/subs.awk" >$tmp/out \ + || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5 +$as_echo "$as_me: error: could not create $ac_file" >&2;} + { (exit 1); exit 1; }; } test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && - { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && - { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ - "$ac_tmp/out"`; test -z "$ac_out"; } && - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' -which seems to be undefined. Please make sure it is defined" >&5 + { ac_out=`sed -n '/\${datarootdir}/p' "$tmp/out"`; test -n "$ac_out"; } && + { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' "$tmp/out"`; test -z "$ac_out"; } && + { $as_echo "$as_me:$LINENO: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined." >&5 $as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' -which seems to be undefined. Please make sure it is defined" >&2;} +which seems to be undefined. Please make sure it is defined." >&2;} - rm -f "$ac_tmp/stdin" + rm -f "$tmp/stdin" case $ac_file in - -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";; - *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";; + -) cat "$tmp/out" && rm -f "$tmp/out";; + *) rm -f "$ac_file" && mv "$tmp/out" "$ac_file";; esac \ - || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5 +$as_echo "$as_me: error: could not create $ac_file" >&2;} + { (exit 1); exit 1; }; } ;; @@ -2812,12 +4110,15 @@ which seems to be undefined. Please make sure it is defined" >&2;} done # for ac_tag -as_fn_exit 0 +{ (exit 0); exit 0; } _ACEOF +chmod +x $CONFIG_STATUS ac_clean_files=$ac_clean_files_save test $ac_write_fail = 0 || - as_fn_error $? 
"write failure creating $CONFIG_STATUS" "$LINENO" 5 + { { $as_echo "$as_me:$LINENO: error: write failure creating $CONFIG_STATUS" >&5 +$as_echo "$as_me: error: write failure creating $CONFIG_STATUS" >&2;} + { (exit 1); exit 1; }; } # configure is writing to config.log, and then calls config.status. @@ -2838,10 +4139,10 @@ if test "$no_create" != yes; then exec 5>>config.log # Use ||, not &&, to avoid exiting from the if with $? = 1, which # would make configure fail if this is the last instruction. - $ac_cs_success || as_fn_exit 1 + $ac_cs_success || { (exit 1); exit 1; } fi if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then - { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 + { $as_echo "$as_me:$LINENO: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} fi diff --git a/tools/utils/configure.ac b/tools/utils/configure.ac index 69a50a42..d19c434f 100644 --- a/tools/utils/configure.ac +++ b/tools/utils/configure.ac @@ -1,5 +1,6 @@ AC_INIT([gemBS_utils],[1.0]) +AC_PROG_CC_STDC AC_ARG_WITH(htslib, [AS_HELP_STRING([--with-htslib=PATH], [specify prefix directory for installed htslib library.])]) From 3bed486a707671e452642b4e357d9316e36db22b Mon Sep 17 00:00:00 2001 From: Simon Heath Date: Sat, 21 Mar 2020 10:14:41 +0100 Subject: [PATCH 61/61] Bump version to 3.5.1 --- README.md | 2 ++ gemBS/version.py | 2 +- tools/bs_call | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2f6fe7e0..82f2f4e6 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,8 @@ Documentation can be found at ---------- Changelog: ---------- + 3.5.1 Check if C compiler requires --std=c99 flag for standards conformant behaviour + 3.5.1 Make sure bgzip is copied correctly during installation 3.5.0 Make bs_call process contig pools from largest to smallest (this change alters the sqlite db format so 
if you have a previously started gemBS run you should (a) remove the .gemBS directory, (b) redo the 'gemBS prepare' step to recreate the db file and (3) run 'gemBS db-sync'. diff --git a/gemBS/version.py b/gemBS/version.py index 1fbbc838..1525de63 100644 --- a/gemBS/version.py +++ b/gemBS/version.py @@ -1,4 +1,4 @@ __VERSION_MAJOR = "3" __VERSION_MINOR = "5" -__VERSION_SUBMINOR = "0" +__VERSION_SUBMINOR = "1" __VERSION__ = "%s.%s.%s" % (__VERSION_MAJOR, __VERSION_MINOR,__VERSION_SUBMINOR) diff --git a/tools/bs_call b/tools/bs_call index 8592d85a..c172df48 160000 --- a/tools/bs_call +++ b/tools/bs_call @@ -1 +1 @@ -Subproject commit 8592d85a2fe58ec58f59c630728153694c19bbc4 +Subproject commit c172df48590c360f218c0fe958ff64ae5e92a5ec