Skip to content

Latest commit

 

History

History
291 lines (137 loc) · 6.64 KB

docker_instructions.md

File metadata and controls

291 lines (137 loc) · 6.64 KB

Docker Instructions

If you have a working installation of Docker and want to run small jobs in a contained working environment.

First, sign in and pull the image from docker hub

docker login --username=yourusername
docker pull wodenaz/accelerated-evolution-version3:latest

Next, make sure you see the image in your profile

docker image ls

Finally, enter into the image environment:

docker run -it wodenaz/accelerated-evolution-version3

Navigate trough the file system and you can even run commands:

bash dobf.sh

#############################################################################wodenaz/accelerated-evolution-version3#############################################
# 2.2. Create lists of files for each model and species


###
# 2.2.A. Create a list of files to apply LRT method


for file in chr*bf ; do echo $file >> hyphy.list; done


### run HYPHY

for file in `cat hyphy.list`; do root=`basename $file .bf`; HYPHYMP $file > HYPHY/$root.out;done


###
# 2.2.B. or, create a list of batch files per chromosome to run in different nodes

for chr in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX chrY; do
	echo $chr.*hg19.null.bf >> $chr.null.hg19.list;
	echo $chr.*hg19.alt.bf >> $chr.alt.hg19.list;
	echo $chr.*panTro4.null.bf >> $chr.null.panTro4.list;
	echo $chr.*panTro4.alt.bf >> $chr.alt.panTro4.list;	
done


##########################################################################################################################
# 2.3. Create launcher files for each chromosome to run batches in parallel


		
python shgenerator.py


### Run HYPHY in parallel launchers

for file in chr20*sh ; do bash $file ; done


#########################################

###########################################3
# Step 3


# Once the HYPHY runs have finished, we need to create a table with pvalue and zeta

cd res


for file in *hg19.null.res; do echo $file >> null.hg19.log; done;
for file in *hg19.alt.res; do echo $file >>alt.hg19.log; done;
for file in *panTro4.null.res; do echo $file >>null.panTro4.log; done;
for file in *panTro4.alt.res; do echo $file >> alt.panTro4.log; done;




nano domodelX.sh
#!/usr/bin/env bash
for filename in `cat alt.panTro4.log`; do grep -H "BEST LOG-L:" $filename >> alt.panTro4.tab; done;
for filename in `cat null.panTro4.log`; do grep -H "BEST LOG-L:" $filename >> null.panTro4.tab; done;
for filename in `cat alt.hg19.log`; do grep -H "BEST LOG-L:" $filename >> alt.hg19.tab; done;
for filename in `cat null.hg19.log`; do grep -H "BEST LOG-L:" $filename >> null.hg19.tab; done;


bash domodelX.sh

#########




# Consolidating tables for 5 branches
awk '{print $1 "\t" $3}' null.hg19.tab | sort -k1,1 -V  > nulls.hg19.tab
awk '{print $1 "\t" $3}' alt.hg19.tab | sort -k1,1 -V > alts.hg19.tab

awk '{print $1 "\t" $3}' null.panTro4.tab | sort -k1,1 -V  > nulls.panTro4.tab
awk '{print $1 "\t" $3}' alt.panTro4.tab | sort -k1,1 -V > alts.panTro4.tab


#####
# 3 COLUMNS of data, I can pull down other parameteres later
awk -F"." '{print $1 ":" $2 "\t" $3 }'  nulls.hg19.tab > col1.hg19.tab
paste col1.hg19.tab nulls.hg19.tab  | awk '{print $1 "\t" $2 "\t" $4   }' > lnulls.hg19.tab
paste lnulls.hg19.tab alts.hg19.tab | awk '{print $1 "\t" $2  "\t" $3 "\t" $5  }' |  sort -k1,1 -V > likelihoods.hg19.tab
 
awk -F"." '{print $1 ":" $2 "\t" $3 }'  nulls.panTro4.tab > col1.panTro4.tab
paste col1.panTro4.tab nulls.panTro4.tab  | awk '{print $1 "\t" $2 "\t" $4   }'> lnulls.panTro4.tab
paste lnulls.panTro4.tab alts.panTro4.tab | awk '{print $1 "\t" $2  "\t" $3 "\t" $5  }' |  sort -k1,1 -V > likelihoods.panTro4.tab
 


cat likelihoods.hg19.tab likelihoods.panTro4.tab  |  sort -k1,1 -V | sed 1i"chromosome\tbranch\tlnull\tlalt" > likelihoods.tab



# to go to R:


nano LRT.R

likelihoods = read.table("likelihoods.tab", header = TRUE)  # read tab file
LRT <- -2*(likelihoods$lnull - likelihoods$lalt)
pval <- 1-pchisq(LRT, 1)
l_pvals <- cbind(likelihoods, pval, -log(pval))
write.table(l_pvals, file ="likelihoods.pvals.tab", row.names=F, col.names=F, quote=F) 



RScript LRT.R



#####
# To compute zeta, we need to run PhyloP from PHAST. 

# if the list of prunned alignments is already created in reference and query directories. please run:

bash domodel.sh 



cd ..
cd ..

cp query/MODELS_HKY85/Q.hky85.Branches.tab .
cp ref/MODELS_HKY85/R.hky85.Branches.tab .


wc -l Q.hky85.Branches.tab
wc -l R.hky85.Branches.tab



sed -i -e "1d" Q.hky85.Branches.tab
sed -i -e "1d" R.hky85.Branches.tab



sort Q.hky85.Branches.tab -k1,1 -V > Q.hky85.Branches.sorted.tab
sort R.hky85.Branches.tab -k1,1 -V > R.hky85.Branches.sorted.tab




####


nano zeta_cal.R

rate_Q = as.data.frame(read.table("Q.hky85.Branches.sorted.tab", header = F)) # read tab file 
#colnames(fibroblast_data) <- c('chromosome', 'pval.human', 'zeta.human', 'pval.chimp', 'zeta.chimp','phastcons')
rate_R= as.data.frame(read.table("R.hky85.Branches.sorted.tab", header = F)) # read tab file 
colnames(rate_Q) 
colnames(rate_R) 

zeta <- merge(rate_Q, rate_R, by= "V1")

rate1 <- zeta$V6.x / zeta$V6.y # human
rate2 <- zeta$V5.x / zeta$V5.y # chimp
#rate3 <- fibroblast.zeta$V7.x / fibroblast.zeta$V7.y # node6
#rate4 <- fibroblast.zeta$V4.x / fibroblast.zeta$V4.y # gorilla
#rate5 <- fibroblast.zeta$V8.x / fibroblast.zeta$V8.y # node4
#rate6 <- fibroblast.zeta$V3.x / fibroblast.zeta$V3.y # orangutan


ncHAE_zeta <- data.frame(zeta$V1 , rate1, rate2)
colnames(ncHAE_zeta) <- c('chromosome', 'zeta.human', 'zeta.chimp')


write.table(ncHAE_zeta, file ="PhyloFit.ncHAE.data", row.names=F, col.names=T, quote=F) 



### 

Rscript zeta_cal.R



####



awk '{ print $1 "\t" $2 "\t" $5 }' likelihoods.pvals.tab | grep 'hg19' > ncHAE.human.data
awk '{ print $1 "\t" $2 "\t" $5 }' likelihoods.pvals.tab | grep 'panTro4' > ncHAE.chimp.data



####

R



human_1 = as.data.frame(read.table("ncHAE.human.data", header = F)) # read tab file 
chimp_2 = as.data.frame(read.table("ncHAE.chimp.data", header = F)) # read tab file


 
head(human_1)
head(chimp_2)


human_1$V2 <- NULL

chimp_2$V2 <- NULL



ncHAE.selection.data1 <- merge(human_1, chimp_2, by= "V1")


colnames(ncHAE.selection.data1) <-  c('chromosome', 'pval.human', 'pval.chimp')


rates = as.data.frame(read.table("PhyloFit.ncHAE.data", header = T)) # read tab file   
head(rates)
dim(rates)

ncHAE.selection.data <- merge(ncHAE.selection.data1, rates, by= c("chromosome"))

write.table(ncHAE.selection.data, file ="ncHAE.selection.data", row.names=F, col.names=T, quote=F) 



#q()


# FLAGGING WITH THE GENOME BLACK LIST 


Finally, exit close Docker and delete images if necessary

exit

docker rm -f $(docker ps -aq)
docker system prune -a