Skip to content

Commit

Permalink
update consensus peak
Browse files Browse the repository at this point in the history
adjust downstream rules after consensus peaks
  • Loading branch information
gartician committed Sep 2, 2021
1 parent 98bd521 commit 8c92e3c
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 81 deletions.
30 changes: 15 additions & 15 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ localrules: fraglength_plot, FRiP, counts_table, multiqc, homer

rule all:
input:
# quality control -----------------------------------------------------------------
# quality control
expand("data/fastp/{sample}_{read}.fastq.gz", sample = SAMPLES, read = ["R1", "R2"]),
expand("data/fastqc/{reads}_fastqc.html", reads = all_reads),
expand("data/fastq_screen/{sample}_{read}_screen.txt", sample = SAMPLES, read = ["R1", "R2"]),
Expand All @@ -42,14 +42,14 @@ rule all:
"data/multiqc/multiqc_report.html",
"data/fraglen.html",
"data/frip.html",
# read alignment ------------------------------------------------------------------
# read alignment
expand("data/banlist/{sample}.banlist.filtered.rmdup.sorted.bam", sample = SAMPLES),
expand("data/bigwig/{sample}.bw", sample = SAMPLES),
# peak calling --------------------------------------------------------------------
expand("data/macs2/{sample}/{sample}_peaks.broadPeak", sample = SAMPLES),
# peak calling
expand("data/macs2/{sample}_peaks.broadPeak", sample = SAMPLES),
"data/macs2/consensus_peaks.bed",
"data/counts/counts_table.txt",
# differential --------------------------------------------------------------------
# differential
"data/deseq2",
"data/diffbind",
"data/homer"
Expand Down Expand Up @@ -256,30 +256,29 @@ rule macs2:
bam = "data/banlist/{sample}.banlist.filtered.rmdup.sorted.bam",
bai = "data/banlist/{sample}.banlist.filtered.rmdup.sorted.bam.bai"
output:
"data/macs2/{sample}/{sample}_peaks.broadPeak"
"data/macs2/{sample}_peaks.broadPeak"
conda:
"envs/macs2.yaml"
params:
outdir = "data/macs2/{sample}",
genome_size = config["GSIZE"]
log:
"data/logs/macs2_{sample}.log"
shell:
"macs2 callpeak -t {input.bam} -n {wildcards.sample} "
"--format BAMPE --gsize {params.genome_size} "
"--outdir {params.outdir} --broad > {log} 2>&1"
"--format BAMPE --gsize {config[GSIZE]} "
"--outdir data/macs2 --broad > {log} 2>&1"

rule consensus:
input:
expand("data/macs2/{sample}/{sample}_peaks.broadPeak", sample = SAMPLES)
expand("data/macs2/{sample}_peaks.broadPeak", sample = SAMPLES)
output:
"data/macs2/consensus_peaks.bed"
params:
n_intersects = config["N_INTERSECTS"]
conda:
"envs/consensus.yaml"
script:
"scripts/consensus_peaks.R"
"envs/bedtools.yaml"
shell:
"bash scripts/consensus_peaks.sh {params.n_intersects} {input} > {output}"

rule counts:
input:
Expand All @@ -294,16 +293,17 @@ rule counts:

rule counts_table:
input:
expand("data/multicov/{sample}.txt", sample = SAMPLES)
expand("data/multicov/{sample}.txt", sample = sorted(SAMPLES))
output:
"data/counts/counts_table.txt"
run:
dfs = []
for file in list(input):
sample_name = os.path.basename(file).split('.')[0]
dfs.append( pd.read_csv(file, sep = "\t", names = ["chr", "start", "end", "name", sample_name]) )
dfs.append( pd.read_csv(file, sep = "\t", names = ["chr", "start", "end", sample_name]) )
df = pd.concat(dfs, axis = 1)
df = df.loc[:,~df.columns.duplicated()]
df.insert( 3, "name", ["peak" + str(i) for i in df.index] )
df.to_csv(str(output), header = True, index = False, sep = "\t")

# differential testing ----------------------------------------------------------------------------
Expand Down
60 changes: 0 additions & 60 deletions scripts/consensus_peaks.R

This file was deleted.

32 changes: 32 additions & 0 deletions scripts/consensus_peaks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash

# Build a consensus peak set from per-sample MACS2 broadPeak files.
#
# Usage: consensus_peaks.sh <n_intersects> <broadPeak files...> > consensus.bed
#
# For each condition (the part of the sample name before the first "_"),
# keep merged intervals supported by at least <n_intersects> replicate
# files, then merge the per-condition sets into one BED on stdout.

# import + check args
n_intersects=$1
all_inputs="${*:2}"

# NOTE: stdout is redirected into the output BED by the caller, so all
# diagnostics must go to stderr or they end up inside consensus_peaks.bed.
if ! [[ "$n_intersects" =~ ^[0-9]+$ ]]; then
    echo "First argument must be an integer. Your input: $n_intersects" >&2
    exit 1
fi

if [ -z "$all_inputs" ]; then
    echo "No input broadPeak files were provided." >&2
    exit 1
fi

# Condition names: third path component (data/macs2/<file>), up to the first "_".
# NOTE(review): assumes every input path has the depth data/macs2/<file> — confirm with caller.
all_conditions=$(echo $all_inputs | tr ' ' '\n' | cut -d/ -f3 | cut -d_ -f1 | sort | uniq)

# Track the tmp files created in THIS run, so stale *.tmp.bed files left by a
# previous failed run cannot leak into the consensus.
all_temp_files=""

for condition in $all_conditions; do

    # file I/O
    tmp_output="data/macs2/$condition.tmp.bed"

    # List all replicates for this condition. Anchor the pattern at the start
    # ("<condition>_*") so a condition name that is a substring of another
    # (e.g. "KO" vs "KO2") cannot pull in the wrong replicates. Every MACS2
    # output ends in "_peaks.broadPeak", so the condition is always followed by "_".
    all_replicates=$(find data/macs2/ -name "${condition}_*.broadPeak" | sort | tr '\n' ' ')

    # Merge overlapping peaks across replicates, count how many replicate
    # files support each merged interval (-c appends the count as column 4),
    # and keep intervals supported by at least n replicates.
    cat $all_replicates | cut -f1-3 | sort -k1,1 -k2,2n | bedtools merge | \
    bedtools intersect -c -a stdin -b $all_replicates | \
    awk -v OFS='\t' -v n="$n_intersects" '$4 >= n {print $1,$2,$3}' > "$tmp_output"

    all_temp_files="$all_temp_files $tmp_output"

done

# merge intervals for all tmp files and export as consensus peak.
cat $all_temp_files | sort -k1,1 -k2,2n | bedtools merge
rm $all_temp_files
2 changes: 1 addition & 1 deletion scripts/deseq2.R
Original file line number Diff line number Diff line change
Expand Up @@ -135,4 +135,4 @@ all_sig_intervals <- all_sig_intervals %>%
rename(name = "name")

write.table(de_stats, snakemake@output[['stats']], quote = FALSE, sep = "\t", row.names = F, col.names = T)
write.table(all_sig_intervals, snakemake@output[['all_sig_intervals']], quote = FALSE, sep = "\t", row.names = F, col.names = F)
write.table(all_sig_intervals, snakemake@output[['all_sig_intervals']], quote = FALSE, sep = "\t", row.names = F, col.names = F)
4 changes: 1 addition & 3 deletions scripts/diffbind.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ if (dir.exists("data/diffbind")) {
dir.create("data/diffbind")
}

consensus_peaks <- GRanges(read.table(snakemake@input[["consensus_peaks"]], col.names=c("seqnames", "start", "end", "name")))
consensus_peaks <- GRanges(read.table(snakemake@input[["consensus_peaks"]], col.names=c("seqnames", "start", "end")) %>% mutate(name = paste0("peak", row_number())) )

ATAC <-dba(sampleSheet=snakemake@input[["metadata"]])
ATAC <- dba.count(ATAC, peaks=consensus_peaks)
Expand All @@ -23,8 +23,6 @@ ATAC <- dba.analyze(ATAC)

# loop through all contrasts and export results
dba_meta <- dba.show(ATAC, bContrasts=TRUE)
dba_meta[,c("Group", "Group2")] %>%
write.table(., snakemake@output[["contrast_combinations"]], sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

# save.image()
for (i in 1:nrow(dba_meta)) {
Expand Down
9 changes: 7 additions & 2 deletions scripts/homer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ done
# reset current homer results
if [ -d "data/homer" ]; then
rm -r "data/homer"
mkdir "data/homer"
fi

# check if input files exists.
Expand Down Expand Up @@ -77,14 +78,18 @@ do
echo "ERROR: $up_peaks had less than 10 DE intervals."
else
echo "Running HOMER for $up_peaks_count up peaks in $up_peaks"
sbatch --job-name 'mm_donuts' --wait --wrap="findMotifsGenome.pl $up_peaks $g data/homer/$contrast-up -size 200 > $up_log 2>&1" &
job_out="jobs/out/homer-$contrast.out"
job_err="jobs/error/homer-$contrast.err"
sbatch -e $job_err -o $job_out --job-name 'mm_donuts' --wait --wrap="findMotifsGenome.pl $up_peaks $g data/homer/$contrast-up -size 200 > $up_log 2>&1" &
fi

if [ "$dn_peaks_count" -lt 10 ]; then
echo "ERROR: $dn_peaks had less than 10 DE intervals."
else
echo "Running HOMER for $dn_peaks_count down peaks in $dn_peaks"
sbatch --job-name 'mm_donuts' --wait --wrap="findMotifsGenome.pl $dn_peaks $g data/homer/$contrast-down -size 200 > $dn_log 2>&1" &
job_out="jobs/out/homer-$contrast.out"
job_err="jobs/error/homer-$contrast.err"
sbatch -e $job_err -o $job_out --job-name 'mm_donuts' --wait --wrap="findMotifsGenome.pl $dn_peaks $g data/homer/$contrast-down -size 200 > $dn_log 2>&1" &
fi

done < $i
Expand Down

0 comments on commit 8c92e3c

Please sign in to comment.