From 8c92e3cca83a011f555ed70e9f851784f1b19171 Mon Sep 17 00:00:00 2001 From: Garth Kong Date: Thu, 2 Sep 2021 14:05:50 -0700 Subject: [PATCH] update consensus peak adjust downstream rules after consensus peaks --- Snakefile | 30 +++++++++---------- scripts/consensus_peaks.R | 60 -------------------------------------- scripts/consensus_peaks.sh | 32 ++++++++++++++++++++ scripts/deseq2.R | 2 +- scripts/diffbind.R | 4 +-- scripts/homer.sh | 9 ++++-- 6 files changed, 56 insertions(+), 81 deletions(-) delete mode 100644 scripts/consensus_peaks.R create mode 100644 scripts/consensus_peaks.sh diff --git a/Snakefile b/Snakefile index dd8a07a..9d372be 100644 --- a/Snakefile +++ b/Snakefile @@ -33,7 +33,7 @@ localrules: fraglength_plot, FRiP, counts_table, multiqc, homer rule all: input: - # quality control ----------------------------------------------------------------- + # quality control expand("data/fastp/{sample}_{read}.fastq.gz", sample = SAMPLES, read = ["R1", "R2"]), expand("data/fastqc/{reads}_fastqc.html", reads = all_reads), expand("data/fastq_screen/{sample}_{read}_screen.txt", sample = SAMPLES, read = ["R1", "R2"]), @@ -42,14 +42,14 @@ rule all: "data/multiqc/multiqc_report.html", "data/fraglen.html", "data/frip.html", - # read alignment ------------------------------------------------------------------ + # read alignment expand("data/banlist/{sample}.banlist.filtered.rmdup.sorted.bam", sample = SAMPLES), expand("data/bigwig/{sample}.bw", sample = SAMPLES), - # peak calling -------------------------------------------------------------------- - expand("data/macs2/{sample}/{sample}_peaks.broadPeak", sample = SAMPLES), + # peak calling + expand("data/macs2/{sample}_peaks.broadPeak", sample = SAMPLES), "data/macs2/consensus_peaks.bed", "data/counts/counts_table.txt", - # differential -------------------------------------------------------------------- + # differential "data/deseq2", "data/diffbind", "data/homer" @@ -256,30 +256,29 @@ rule macs2: bam = "data/banlist/{sample}.banlist.filtered.rmdup.sorted.bam", bai = "data/banlist/{sample}.banlist.filtered.rmdup.sorted.bam.bai" output: - "data/macs2/{sample}/{sample}_peaks.broadPeak" + "data/macs2/{sample}_peaks.broadPeak" conda: "envs/macs2.yaml" params: - outdir = "data/macs2/{sample}", genome_size = config["GSIZE"] log: "data/logs/macs2_{sample}.log" shell: "macs2 callpeak -t {input.bam} -n {wildcards.sample} " - "--format BAMPE --gsize {params.genome_size} " - "--outdir {params.outdir} --broad > {log} 2>&1" + "--format BAMPE --gsize {config[GSIZE]} " + "--outdir data/macs2 --broad > {log} 2>&1" rule consensus: input: - expand("data/macs2/{sample}/{sample}_peaks.broadPeak", sample = SAMPLES) + expand("data/macs2/{sample}_peaks.broadPeak", sample = SAMPLES) output: "data/macs2/consensus_peaks.bed" params: n_intersects = config["N_INTERSECTS"] conda: - "envs/consensus.yaml" - script: - "scripts/consensus_peaks.R" + "envs/bedtools.yaml" + shell: + "bash scripts/consensus_peaks.sh {params.n_intersects} {input} > {output}" rule counts: input: @@ -294,16 +293,17 @@ rule counts: rule counts_table: input: - expand("data/multicov/{sample}.txt", sample = SAMPLES) + expand("data/multicov/{sample}.txt", sample = sorted(SAMPLES)) output: "data/counts/counts_table.txt" run: dfs = [] for file in list(input): sample_name = os.path.basename(file).split('.')[0] - dfs.append( pd.read_csv(file, sep = "\t", names = ["chr", "start", "end", "name", sample_name]) ) + dfs.append( pd.read_csv(file, sep = "\t", names = ["chr", "start", "end", sample_name]) ) df = pd.concat(dfs, axis = 1) df = df.loc[:,~df.columns.duplicated()] + df.insert( 3, "name", ["peak" + str(i) for i in df.index] ) df.to_csv(str(output), header = True, index = False, sep = "\t") # differential testing ---------------------------------------------------------------------------- diff --git a/scripts/consensus_peaks.R b/scripts/consensus_peaks.R deleted file mode 100644 index a4f6daa..0000000 --- a/scripts/consensus_peaks.R +++ /dev/null @@ -1,60 +0,0 @@ -library(dplyr) -library(stringr) -library(GenomicRanges) -library(plyranges) -library(tidyr) - -# for each condition, read in peaks containing all samples. -list_of_peak_files <- lapply(unlist(snakemake@input), - read.delim, - header = FALSE, - col.names = c("seqnames", "start", - "end", "name", - "score", "strand", - "signalValue", "pval", - "qval", "peak")) -all_peaks_df <- bind_rows(list_of_peak_files) - -# define condition and replicate. -all_peaks_df <- all_peaks_df %>% - separate(name, "condrep", sep = "_", remove=FALSE, extra = "drop") %>% - mutate(rep = str_sub(condrep, start = -1)) %>% - mutate(cond = str_sub(condrep, start = 1, end = -2)) %>% - mutate(condrep = NULL) %>% - mutate(strand = "*") - -# split the df into list of GRanges by condition. compute coverage for each condition. -peaks_by_condition <- makeGRangesListFromDataFrame(all_peaks_df, split.field = "cond") - -# find peaks present in this many or more samples in one condition. -presence_in_samples <- snakemake@params$n_intersects - -consensus_peaks <- GRangesList() -for (condition in names(peaks_by_condition)) { - overlaps_by_condition <- compute_coverage(peaks_by_condition[condition]) %>% - filter(score >= presence_in_samples) %>% - reduce_ranges() - - if (length(overlaps_by_condition) == 0) { - next - } else { - seqlengths(overlaps_by_condition) <- rep(NA,length(seqlengths(overlaps_by_condition))) # set seqlengths as NA - consensus_peaks[[condition]] <- overlaps_by_condition - } -} - -# You can see which conditions contributed to the consensus peaks with the following! -# unlist(consensus_peaks) - -# merge overlapping peaks across conds, turn list of GRanges into df, rm blacklist regions, export. -consensus_df <- unlist(consensus_peaks) %>% reduce_ranges() -consensus_df <- as.data.frame(consensus_df, row.names = NULL) -consensus_df <- consensus_df %>% - select(seqnames, start, end) %>% - mutate(name = paste("consensus_peak", row_number(), sep = "_")) %>% - filter(start < end) - -write.table(x = consensus_df, - file = snakemake@output[[1]], sep = "\t", - row.names = FALSE, col.names = FALSE, - quote = FALSE) \ No newline at end of file diff --git a/scripts/consensus_peaks.sh b/scripts/consensus_peaks.sh new file mode 100644 index 0000000..298787b --- /dev/null +++ b/scripts/consensus_peaks.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# import + check args +n_intersects=$1 +all_inputs=$(echo "${@:2}") + +if ! [[ "$n_intersects" =~ ^[0-9]+$ ]]; then + echo "First argument must be an integer. Your input: $n_intersects" + exit 1 +fi + +all_conditions=$(echo $all_inputs | tr ' ' '\n' | cut -d/ -f3 | cut -d_ -f1 | sort | uniq) + +for condition in $all_conditions; do + + # file I/O + tmp_output="data/macs2/$condition.tmp.bed" + + # list all replicates in one condition + all_replicates=$(find data/macs2/ -name "*$condition*.broadPeak" | sort | tr '\n' ' ') + + # find widest peak that appear in at least n replicates + export to tmp file. + cat $all_replicates | cut -f1-3 | sort -k1,1 -k2,2n | bedtools merge | \ + bedtools intersect -c -a stdin -b $all_replicates | \ + awk -v n=$n_intersects '$4 >= n' | awk -v OFS='\t' '{print $1,$2,$3}' > $tmp_output + +done + +# merge intervals for all tmp files and export as consensus peak. +all_temp_files=$(find data/macs2 -name "*.tmp.bed" | sort | tr '\n' ' ') +cat $all_temp_files | sort -k1,1 -k2,2n | bedtools merge +rm $all_temp_files \ No newline at end of file diff --git a/scripts/deseq2.R b/scripts/deseq2.R index 031e803..8be96c1 100644 --- a/scripts/deseq2.R +++ b/scripts/deseq2.R @@ -135,4 +135,4 @@ all_sig_intervals <- all_sig_intervals %>% rename(name = "name") write.table(de_stats, snakemake@output[['stats']], quote = FALSE, sep = "\t", row.names = F, col.names = T) -write.table(all_sig_intervals, snakemake@output[['all_sig_intervals']], quote = FALSE, sep = "\t", row.names = F, col.names = F) \ No newline at end of file +write.table(all_sig_intervals, snakemake@output[['all_sig_intervals']], quote = FALSE, sep = "\t", row.names = F, col.names = F) diff --git a/scripts/diffbind.R b/scripts/diffbind.R index e845a9e..076a012 100644 --- a/scripts/diffbind.R +++ b/scripts/diffbind.R @@ -13,7 +13,7 @@ if (dir.exists("data/diffbind")) { dir.create("data/diffbind") } -consensus_peaks <- GRanges(read.table(snakemake@input[["consensus_peaks"]], col.names=c("seqnames", "start", "end", "name"))) +consensus_peaks <- GRanges(read.table(snakemake@input[["consensus_peaks"]], col.names=c("seqnames", "start", "end")) %>% mutate(name = paste0("peak", row_number())) ) ATAC <-dba(sampleSheet=snakemake@input[["metadata"]]) ATAC <- dba.count(ATAC, peaks=consensus_peaks) @@ -23,8 +23,6 @@ ATAC <- dba.analyze(ATAC) # loop through all contrasts and export results dba_meta <- dba.show(ATAC, bContrasts=TRUE) -dba_meta[,c("Group", "Group2")] %>% - write.table(., snakemake@output[["contrast_combinations"]], sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) # save.image() for (i in 1:nrow(dba_meta)) { diff --git a/scripts/homer.sh b/scripts/homer.sh index f53987b..421c244 100644 --- a/scripts/homer.sh +++ b/scripts/homer.sh @@ -12,6 +12,7 @@ done # reset current homer results if [ -d "data/homer" ]; then rm -r "data/homer" + mkdir "data/homer" fi # check if input files exists. @@ -77,14 +78,18 @@ do echo "ERROR: $up_peaks had less than 10 DE intervals." else echo "Running HOMER for $up_peaks_count up peaks in $up_peaks" - sbatch --job-name 'mm_donuts' --wait --wrap="findMotifsGenome.pl $up_peaks $g data/homer/$contrast-up -size 200 > $up_log 2>&1" & + job_out="jobs/out/homer-$contrast.out" + job_err="jobs/error/homer-$contrast.err" + sbatch -e $job_err -o $job_out --job-name 'mm_donuts' --wait --wrap="findMotifsGenome.pl $up_peaks $g data/homer/$contrast-up -size 200 > $up_log 2>&1" & fi if [ "$dn_peaks_count" -lt 10 ]; then echo "ERROR: $dn_peaks had less than 10 DE intervals." else echo "Running HOMER for $dn_peaks_count up peaks in $dn_peaks" - sbatch --job-name 'mm_donuts' --wait --wrap="findMotifsGenome.pl $dn_peaks $g data/homer/$contrast-down -size 200 > $dn_log 2>&1" & + job_out="jobs/out/homer-$contrast.out" + job_err="jobs/error/homer-$contrast.err" + sbatch -e $job_err -o $job_out --job-name 'mm_donuts' --wait --wrap="findMotifsGenome.pl $dn_peaks $g data/homer/$contrast-down -size 200 > $dn_log 2>&1" & fi done < $i