To use Cactus to create a genome wide alignment
conda activate repeatmasker
####################### L. var
nano repeatmodel_lvar.sh
#!/usr/bin/env bash
#SBATCH [email protected]
#SBATCH --mail-type=END,FAIL
#SBATCH --mem 70000
cd /data/wraycompute/alejo/newgenomes
BuildDatabase -name Lvar Lvar.fasta # to build database
RepeatModeler -database Lvar -pa 23
# Run
sbatch repeatmodel_lvar.sh
####################### H. tub
nano repeatmodel_htub.sh
#!/usr/bin/env bash
#SBATCH [email protected]
#SBATCH --mail-type=END,FAIL
#SBATCH --mem 35000
cd /data/wraycompute/alejo/newgenomes
BuildDatabase -name Htub Htub.fasta # to build database
RepeatModeler -database Htub -pa 23
sbatch repeatmodel_htub.sh
####################### H. ery
nano repeatmodel_hery.sh
#!/usr/bin/env bash
#SBATCH [email protected]
#SBATCH --mail-type=END,FAIL
#SBATCH --mem 35000
cd /data/wraycompute/alejo/newgenomes
BuildDatabase -name Hery Hery.fasta # to build database
RepeatModeler -database Hery -pa 23
# run
sbatch repeatmodel_hery.sh
####################### E. luc
nano repeatmodel_eluc.sh
#!/usr/bin/env bash
#SBATCH [email protected]
#SBATCH --mail-type=END,FAIL
#SBATCH --mem 35000
cd /data/wraycompute/alejo/newgenomes
BuildDatabase -name Eluc Eluc.fasta # to build database
RepeatModeler -database Eluc -pa 23
# run
sbatch repeatmodel_hery.sh
#############################
# to check the memory allowance
sacct -j ID --format=JobID,JobName,ReqMem,MaxRSS,Elapsed # RAM requested/used!!
####################### H. ery
mkdir Hery_mask_custom
nano repeatmasker_hery.sh
#!/usr/bin/env bash
#SBATCH [email protected]
#SBATCH --mail-type=END,FAIL
cd /data/wraycompute/alejo/newgenomes
RepeatMasker -engine ncbi -pa 23 -s -lib Hery-families.fa -gff -dir Hery_mask_custom -xsmall Hery.fasta
# run
sbatch repeatmasker_hery.sh
####################### H. tub
mkdir Htub_mask_custom
nano repeatmasker_htub.sh
#!/usr/bin/env bash
#SBATCH [email protected]
#SBATCH --mail-type=END,FAIL
cd /data/wraycompute/alejo/newgenomes
RepeatMasker -engine ncbi -pa 23 -s -lib Htub-families.fa -gff -dir Htub_mask_custom -xsmall Htub.fasta
# Run
sbatch repeatmasker_htub.sh
########################## L. var
mkdir Lvar_mask_custom
nano repeatmasker_lvar.sh
#!/usr/bin/env bash
#SBATCH [email protected]
#SBATCH --mail-type=END,FAIL
cd /data/wraycompute/alejo/newgenomes
RepeatMasker -engine ncbi -pa 23 -s -lib Lvar-families.fa -gff -dir Lvar_mask_custom -xsmall Lvar.fasta
# Run
sbatch repeatmasker_lvar.sh
########################## E. luc
mkdir Eluc_mask_custom
nano repeatmasker_eluc.sh
#!/usr/bin/env bash
#SBATCH [email protected]
#SBATCH --mail-type=END,FAIL
cd /data/wraycompute/alejo/newgenomes
#SBATCH --mem 6G
#SBATCH --cpus-per-task=23
RepeatMasker -engine ncbi -pa 23 -s -lib Eluc-families.fa -gff -dir Eluc_mask_custom -xsmall Eluc.fasta
# Run
sbatch repeatmasker_eluc.sh
Create urchin_seqfile.txt file with masked genomes. The * simbolizes the genomes are reference quality
nano urchin_seqfile.txt
# Sequence data for progressive alignment of 3 genomes
# Lv, He and Ht are flagged as good assemblies.
# all will be used as an outgroup species.
(Lv:1,(He:0.2,Ht:0.2):0.8);
*He /data/wraycompute/alejo/PS_tests/Genome_alignments/masking_genomes/Hery.masked.fasta
*Ht /data/wraycompute/alejo/PS_tests/Genome_alignments/masking_genomes/Htub.masked.fasta
*Lv /data/wraycompute/alejo/PS_tests/Genome_alignments/masking_genomes/Lvar.masked.fasta
source cactus_env/bin/activate
mkdir urchins_wkdir
nano cactus_fastas2hal.sh
#!/usr/bin/env bash
#SBATCH -J cactus_alignment
#SBATCH --mail-type=END
#SBATCH [email protected]
#SBATCH -N 4
#SBATCH -c 24
#SBATCH --mem=40G
cactus --workDir urchins_wkdir/ --maxCores 96 --maxMemory 150G jobStore_urchin urchin_seqfile.txt urchins.hal --binariesMode local
sbatch cactus_fastas2hal.sh
Next, we need to convert HAL to MAF for each chromosome:
nano cactus_hal2maf.sh
#!/usr/bin/env bash
#SBATCH -J hal2maf
#SBATCH --mail-type=END
#SBATCH [email protected]
#SBATCH -N 1
#SBATCH --mem=4G
for chr in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 ; do
hal2maf --noAncestors --noDupes urchins.hal $chr.urchins.maf --refGenome He --refSequence $chr
done
sbatch cactus_hal2maf.sh
To extract features, we need to extract fasta using msa_split but first we need to proceess a bed file into feature data per chromosome
To create features
mkdir features
for chr in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 ;
do grep -w $chr Hery_adult_peaks.bed | awk '{print $1 "\t" $2 "\t" $3 }' | sort -k1,1 -k2,2 -V > features/$chr.feat.bed;
done
Use msa_split to extract fasta files using the genome of reference.
awk '/^>chr/ {OUT=substr($0,2) ".fa";print " ">OUT}; OUT{print >OUT}' Hery_genome.fasta.masked
# Obtain a genome size file
module load samtools
samtools faidx Hery_genome.fasta.masked
awk '{print $1 "\t" $2 }' Hery_genome.fasta.masked.fai > sizes.genome
mkdir query
nano maf2fasta.sh
#!/usr/bin/env bash
#SBATCH -J maf2fasta
#SBATCH --mail-type=END
#SBATCH [email protected]
#SBATCH -N 1
for file in chr*urchins.maf; do root=`basename $file .urchins.maf`; maf_parse $file --seqs Lv,Ht,He > $root.noAnc.maf; done
for chr in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21;
do msa_split $chr.noAnc.maf --refseq $chr.fa --gap-strip ANY -q --in-format MAF --features features/$chr.feat.bed --for-features --out-root query/$chr;
done
sbatch maf2fasta.sh