-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFASTQ_Me.sh
123 lines (104 loc) · 3.14 KB
/
FASTQ_Me.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/bin/bash
# Arguments
project=Pilot
#https://bioinformatics.ucdavis.edu/research-computing/documentation/archiving-slims-data/
SLIMSstring=gz3bl6zibe
SLIMSdir=Un_DTDB126/Project_JLBL_Nova150P_Laufer
genome=hg38
programsPATH=/share/lasallelab/programs
echo "Creating directory for $project"
mkdir ${project}
echo "Downloading fastq files for $SLIMSstring"
call="rsync \
-avL \
slimsdata.genomecenter.ucdavis.edu::slims/${SLIMSstring}/ \
${project}"
echo ${call}
eval ${call}
echo "md5sum check"
cd ${project}/${SLIMSdir}
if md5sum -c \@md5Sum.md5
then
echo
echo "All files have the correct md5sum"
else
echo "ERROR: Some files are corrupt or missing"
exit 1
fi
echo "Merging lanes"
echo "Moving undetermined files"
mkdir Other
mv Undetermined* Other
echo "Checking for the right number of unique sample IDs for both R1 and R2"
countFASTQ(){
awk -F '_' '{print $1}' | \
sort -u | \
wc -l
}
export -f countFASTQ
R1=`ls -1 *R1*.gz | countFASTQ`
R2=`ls -1 *R2*.gz | countFASTQ`
if [ ${R1} = ${R2} ]
then
lanes=`ls -1 *R1*.gz | \
awk -F '_' '{print $1}' | \
sort | \
uniq -c | \
awk -F ' ' '{print $1}' | \
sort -u`
echo "${R1} samples sequenced across ${lanes} lanes identified for merging"
else
echo "ERROR: There are ${R1} R1 files and ${R2} R2 files"
exit 1
fi
echo "Creating a file of unique IDs based on first string from underscore delimiter"
ls -1 *fastq.gz | \
awk -F '_' '{print $1}' | \
sort -u > \
task_samples.txt
echo "Printing merge commands"
mergeLanesTest(){
i=$1
echo cat ${i}\_*_R1_001.fastq.gz \> ${i}\_1.fq.gz
echo cat ${i}\_*_R2_001.fastq.gz \> ${i}\_2.fq.gz
}
export -f mergeLanesTest
cat task_samples.txt | parallel --jobs 6 --will-cite mergeLanesTest
echo "Merging ${lanes} lanes for ${R1} samples"
mergeLanes(){
i=$1
cat ${i}\_*_R1_001.fastq.gz > ${i}\_1.fq.gz
cat ${i}\_*_R2_001.fastq.gz > ${i}\_2.fq.gz
}
export -f mergeLanes
cat task_samples.txt | parallel --jobs 6 --verbose --will-cite mergeLanes
# Concise merging calls but doesn't make it easy for those who don't use GNU parallel
#parallel --jobs 6 "echo cat {}_*_R1_001.fastq.gz \> {}\_1.fq.gz" ::: `cat task_samples.txt`
#parallel --jobs 6 "echo cat {}_*_R2_001.fastq.gz \> {}\_2.fq.gz" ::: `cat task_samples.txt`
#parallel --jobs 6 --verbose "cat {}_"\*"_R1_001.fastq.gz > {}_1.fq.gz" ::: `cat task_samples.txt`
#parallel --jobs 6 --verbose "cat {}_"\*"_R2_001.fastq.gz > {}_2.fq.gz" ::: `cat task_samples.txt`
echo "Creating directories for CpG_Me"
mkdir ../../raw_sequences
mv task_samples.txt ../..
mv *.fq.gz ../../raw_sequences
echo "Checking read pairs"
cd ../../raw_sequences
pairedReads=$(($(ls | wc -l)/2))
if [ ${pairedReads} = ${R1} ]
then
echo "${pairedReads} will be aligned using CpG_Me"
else
echo "ERROR: Incorrect number of merged read pairs in raw_sequences"
echo "There are ${pairedReads} but there should be ${R1}"
exit 1
fi
echo "Submitting $genome alignment command to cluster for ${project}"
cd ..
call="sbatch \
--array=1-${pairedReads} \
${programsPATH}/CpG_Me/Paired-end/CpG_Me_PE_controller.sh \
${genome}"
echo ${call}
eval ${call}
echo "Done"
exit 0