-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtutorials.prioritization.R
130 lines (106 loc) · 5.8 KB
/
tutorials.prioritization.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
###############################################################################
## Tutorial for CCM bioinformatics series, The Hospital for Sick Children, 2019-03-20
###############################################################################
# use select and then Ctrl + Enter to source a function
filter_variants <- function(variants, proband, mom, dad){
variants_filtered <- filter(variants, Zygosity.Ashkenazim_HG002 == proband,
Zygosity.Ashkenazim_HG003 == mom,
Zygosity.Ashkenazim_HG004 == dad)
return(variants_filtered)
}
# installation
install.packages("tidyverse")
install.packages("googledrive")
library(tidyverse)
setwd("~/Downloads/2019_03_variant_prioritization/")
# download ashkenazim.small_variants.csv
# https://drive.google.com/open?id=15bYaktk50qD6qeL-LPgm5NSlk4-XFfYq
library(googledrive)
drive_find(n_max = 50, type = "csv")
drive_download("~/tutorials/2019_03_variant_prioritization/ashkenazim.small_variants.csv")
# assignment <- Alt -
id <- "15bYaktk50qD6qeL-LPgm5NSlk4-XFfYq"
# download by ID - sometimes you have to do it several times if fails
drive_download(as_id(id), overwrite = T)
# download by URL
drive_download(as_id("https://drive.google.com/open?id=15bYaktk50qD6qeL-LPgm5NSlk4-XFfYq"), overwrite = T)
# some values are None in Quality and it reads as character string rather than numerical value
variants <- read_csv("ashkenazim.small_variants.csv", na = c("None"))
spec(variants)
# remove low quality variants supported by 2 variant callers
variants <- filter(variants, Number_of_callers > 2)
# do it in excel first
variants_rare_missense <- filter(variants, Variation == "missense_variant", Maf_all < 0.01)
variants_potentially_deleterious <- filter(variants_rare_missense, Polyphen_score > 0.9)
# explore variation
# pipe = Ctrl + Shift + M
variation <- variants %>% count(Variation)
# equivalent expression
# count(variants, Variation)
ggplot(data = variants) + geom_bar(mapping = aes(x = Variation)) + coord_flip()
variants_recessive <- filter(variants, Zygosity.Ashkenazim_HG002 == "Hom")
variants_recessive <- filter(variants, Zygosity.Ashkenazim_HG002 == "Hom",
Zygosity.Ashkenazim_HG003 == "Het",
Zygosity.Ashkenazim_HG004 == "Het",
Omim_gene_description != "0")
write_excel_csv(variants_recessive, "variants_recessive.csv")
# example of how to create and use a function
# variants_recessive <- filter_variants(variants, "Hom", "Het", "Het") %>%
# filter(Omim_gene_description != "0")
# X linked
variants_x_linked <- filter_variants(variants, "Hom","-", "Het") %>%
filter(grepl("X",Omim_inheritance))
# variants reported in clinvar
clinvar <- variants %>% count(Clinvar)
# exersize : plot a bar chart
# pull only clinvar pathogenic variants
variants_clinvar_pathogenic <- filter(variants, Clinvar == "pathogenic")
# pull only variants with high CADD and polyphen scores
variants_computational <- filter(variants, Cadd_score > 30, Polyphen_score > 0.95,
Zygosity.Ashkenazim_HG002 != "-", Omim_gene_description != "0")
# none of them is homozygous!
variants_computational <- filter(variants, Cadd_score > 30, Polyphen_score > 0.95,
Zygosity.Ashkenazim_HG002 != "-", Omim_gene_description != "0",
Omim_inheritance = "AD")
# de novo
variants_de_novo <- filter_variants(variants, "Het", "-", "-")
variants_de_novo <- filter_variants(variants, "Het", "-", "-") %>% filter(Quality > 100)
# DNMT is a candate?
# save de novos
write_excel_csv(variants_de_novo, "variants_denovo.csv", append = F)
# Exercise: are there some more de novo variants?
variants_de_novo <- filter_variants(variants, "Hom", "Het", "-") %>% filter(Quality > 100)
# variants_de_novo <- filter_variants(variants, "Hom", "-", "Het") %>% filter(Quality > 100)
write_excel_csv(variants_de_novo, "variants_denovo.csv", append = T)
# review the file - why so many de novo variants on chrX?
# is proband a male? - search for Ashkenazim trio
# (son - father - mother)
variants_chrx <- filter(variants, grepl("chrX", Position), Number_of_callers > 2)
# has a paralogue in chrY pseudoatosomal region
# https://www.ncbi.nlm.nih.gov/gene/414
variants_chrx <- filter(variants_chrx, Gene != "ARSD")
# what is wrong with ESX1 gene?
# low alt depth - false positive
# Het in ASMTL? - pseudoatosomal region (<2.7M)
# https://www.ncbi.nlm.nih.gov/grc/human
variants_chrx <- filter(variants_chrx, Gene != "ESX1")
# all these variants are not de novos!
# select a subset of columns
colnames(variants)
variants_de_novo_report <- select(variants_de_novo, c("Position", "Ref", "Alt", "Gene", "Variation",
"Gnomad_maf", "rsIDs"))
# exercise : select columns you'd want to report
# TCAG small variant report for WGS for NA12878
# https://drive.google.com/open?id=1RATcIzqoptbyfSoJHTxAUoHT23wCovcz
drive_download(as_id("https://drive.google.com/open?id=1RATcIzqoptbyfSoJHTxAUoHT23wCovcz"), overwrite = T)
wgs_small_variants_tcag <- read_tsv("NA12878.hc.vqsr.vcf_ext.tsv.annovar.out_SUBSET_rev26.5.chr22.tsv")
# pull variants matching HGMD database
variants_hgmd <- filter(wgs_small_variants_tcag, grepl("DM",HGMD_tag))
# exercise: choose your strategy on how to extract pathogenic variants, filter + select with pipe %>%
# structural variant report - for muscular genes
drive_download(as_id("https://drive.google.com/open?id=1O2RtjYbP2kyUxd-VAYhb3WPhgilJOmhj"), overwrite = T)
variants_sv <- read_csv("NA12878.sv.csv")
# sort by lengths decreasing
variants_sv <- arrange(variants_sv, SVLEN)
drive_download(as_id("https://drive.google.com/open?id=1WYPGlOMPvxITzKJg6YlWGajNfu92yyIP"), overwrite = T)
drive_download(as_id("https://drive.google.com/open?id=1WED_JAZJvRFKB3h7MqXSSfjqVT_1Ficr"), overwrite = T)