diff --git a/docs/articles/bedpca.html b/docs/articles/bedpca.html index 7d2d5343..593de78f 100644 --- a/docs/articles/bedpca.html +++ b/docs/articles/bedpca.html @@ -1,12 +1,12 @@ - +
- + @@ -15,14 +15,24 @@library(bigsnpr)
## Loading required package: bigstatsr
library(ggplot2)
+## Warning: package 'ggplot2' was built under R version 4.2.3
Let us use a subsetted version of the 1000 Genomes project data we provide. Some quality control has already been done; otherwise, you can use snp_plinkQC()
.
bedfile <- download_1000G("data")
+Let us use a subsetted version of the 1000 Genomes project data we
+provide. Some quality control has already been done; otherwise, you can
+use snp_plinkQC()
.
bedfile <- download_1000G("tmp-data")
+## Creating directory "tmp-data" which didn't exist..
We then compute PCA without using the related individuals. Function bed_autoSVD()
should take care of Linkage Disequilibrium (LD).
We then compute PCA without using the related individuals. Function
+bed_autoSVD()
should take care of Linkage Disequilibrium
+(LD).
(obj.bed <- bed(bedfile))
-## A 'bed' object with 2490 samples and 1664852 variants.
+## A 'bed' object with 2490 samples and 1664852 variants.
ind.rel <- match(c(rel$IID1, rel$IID2), obj.bed$fam$sample.ID) # /!\ use $ID1 instead with old PLINK
ind.norel <- rows_along(obj.bed)[-ind.rel]
-obj.svd <- bed_autoSVD(obj.bed, ind.row = ind.norel, k = 20,
+obj.svd <- bed_autoSVD(obj.bed, ind.row = ind.norel, k = 20,
ncores = nb_cores())
-##
-## Phase of clumping (on MAC) at r^2 > 0.2.. keep 392665 variants.
-## Discarding 0 variant with MAC < 10.
-##
+## Discarding 118686 variants with MAC < 10 or MAF < 0.02.
+##
+## Phase of clumping (on MAC) at r^2 > 0.2.. keep 322962 variants.
+##
## Iteration 1:
## Computing SVD..
-## 679 outlier variants detected..
-## 7 long-range LD regions detected..
-##
+## 581 outlier variants detected..
+## 5 long-range LD regions detected..
+##
## Iteration 2:
## Computing SVD..
+## 5 outlier variants detected..
+## 0 long-range LD region detected..
+##
+## Iteration 3:
+## Computing SVD..
## 0 outlier variant detected..
-##
+##
## Converged!
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-## Warning: Removed 1 rows containing missing values (geom_bar).
-
+## Warning: Removed 1 row containing missing values or values outside the scale range
+## (`geom_bar()`).
+
plot_grid(plotlist = lapply(7:10, function(k) {
plot(obj.svd, type = "scores", scores = 2 * k - 1:0, coeff = 0.6) +
aes(color = S) +
- scale_colour_viridis_c()
+ scale_colour_viridis_c(direction = -1)
}), scale = 0.95)
-
+
plot_grid(plotlist = lapply(7:10, function(k) {
plot(obj.svd, type = "scores", scores = 2 * k - 1:0, coeff = 0.6) +
- aes(color = S > 0.5) + # threshold based on histogram
- scale_colour_viridis_d()
+ aes(color = S > 0.6) + # threshold based on histogram
+ scale_colour_viridis_d(direction = -1)
}), scale = 0.95)
-
+
We recompute PCA without outliers, starting with the previous set of variants kept (we can therefore skip the initial clumping step).
-ind.row <- ind.norel[S < 0.5]
+We recompute PCA without outliers, starting with the previous set of
+variants kept (we can therefore skip the initial clumping step).
+ind.row <- ind.norel[S < 0.6]
ind.col <- attr(obj.svd, "subset")
-obj.svd2 <- bed_autoSVD(obj.bed, ind.row = ind.row,
+obj.svd2 <- bed_autoSVD(obj.bed, ind.row = ind.row,
ind.col = ind.col, thr.r2 = NA,
k = 20, ncores = nb_cores())
-##
+## Discarding 61 variants with MAC < 10 or MAF < 0.02.
+##
## Skipping clumping.
-## Discarding 0 variant with MAC < 10.
-##
+##
## Iteration 1:
## Computing SVD..
-## 22 outlier variants detected..
-## 1 long-range LD region detected..
-##
-## Iteration 2:
-## Computing SVD..
## 0 outlier variant detected..
-##
+##
## Converged!
plot(obj.svd2)
-
+
plot(obj.svd2, type = "loadings", loadings = 1:20, coeff = 0.4)
-
+
plot(obj.svd2, type = "scores", scores = 1:20, coeff = 0.4)
-
+
PCs <- matrix(NA, nrow(obj.bed), ncol(obj.svd2$u))
PCs[ind.row, ] <- predict(obj.svd2)
-proj <- bed_projectSelfPCA(obj.svd2, obj.bed,
+proj <- bed_projectSelfPCA(obj.svd2, obj.bed,
ind.row = rows_along(obj.bed)[-ind.row],
ncores = 1) # useless -> too few individuals
PCs[-ind.row, ] <- proj$OADP_proj
plot(PCs[ind.row, 7:8], pch = 20, xlab = "PC7", ylab = "PC8")
points(PCs[-ind.row, 7:8], pch = 20, col = "blue")
-
+
library(bigsnpr)
+## Loading required package: bigstatsr
+library(ggplot2)
+## Warning: package 'ggplot2' was built under R version 4.2.3
+Let us use a subsetted version of the 1000 Genomes project data we
+provide. Some quality control has already been done; otherwise, you can
+use snp_plinkQC()
.
bedfile <- download_1000G("tmp-data")
+## Creating directory "tmp-data" which didn't exist..
+We then compute PCA without using the related individuals. Function
+bed_autoSVD()
should take care of Linkage Disequilibrium
+(LD).
(obj.bed <- bed(bedfile))
+## A 'bed' object with 2490 samples and 1664852 variants.
+ind.rel <- match(c(rel$IID1, rel$IID2), obj.bed$fam$sample.ID) # /!\ use $ID1 instead with old PLINK
+ind.norel <- rows_along(obj.bed)[-ind.rel]
+
+obj.svd <- bed_autoSVD(obj.bed, ind.row = ind.norel, k = 20,
+ ncores = nb_cores())
+## Discarding 118686 variants with MAC < 10 or MAF < 0.02.
+##
+## Phase of clumping (on MAC) at r^2 > 0.2.. keep 322962 variants.
+##
+## Iteration 1:
+## Computing SVD..
+## 581 outlier variants detected..
+## 5 long-range LD regions detected..
+##
+## Iteration 2:
+## Computing SVD..
+## 5 outlier variants detected..
+## 0 long-range LD region detected..
+##
+## Iteration 3:
+## Computing SVD..
+## 0 outlier variant detected..
+##
+## Converged!
+Then, we look at if there are individual outliers.
+prob <- bigutilsr::prob_dist(obj.svd$u, ncores = nb_cores())
+S <- prob$dist.self / sqrt(prob$dist.nn)
+
+ggplot() +
+ geom_histogram(aes(S), color = "#000000", fill = "#000000", alpha = 0.5) +
+ scale_x_continuous(breaks = 0:5 / 5, limits = c(0, NA)) +
+ scale_y_sqrt(breaks = c(10, 100, 500)) +
+ theme_bigstatsr() +
+ labs(x = "Statistic of outlierness", y = "Frequency (sqrt-scale)")
+## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
+## Warning: Removed 1 row containing missing values or values outside the scale range
+## (`geom_bar()`).
+
+plot_grid(plotlist = lapply(7:10, function(k) {
+ plot(obj.svd, type = "scores", scores = 2 * k - 1:0, coeff = 0.6) +
+ aes(color = S) +
+ scale_colour_viridis_c(direction = -1)
+}), scale = 0.95)
+
+plot_grid(plotlist = lapply(7:10, function(k) {
+ plot(obj.svd, type = "scores", scores = 2 * k - 1:0, coeff = 0.6) +
+ aes(color = S > 0.6) + # threshold based on histogram
+ scale_colour_viridis_d(direction = -1)
+}), scale = 0.95)
+
+We recompute PCA without outliers, starting with the previous set of +variants kept (we can therefore skip the initial clumping step).
+ind.row <- ind.norel[S < 0.6]
+ind.col <- attr(obj.svd, "subset")
+obj.svd2 <- bed_autoSVD(obj.bed, ind.row = ind.row,
+ ind.col = ind.col, thr.r2 = NA,
+ k = 20, ncores = nb_cores())
+## Discarding 61 variants with MAC < 10 or MAF < 0.02.
+##
+## Skipping clumping.
+##
+## Iteration 1:
+## Computing SVD..
+## 0 outlier variant detected..
+##
+## Converged!
+plot(obj.svd2)
+
+plot(obj.svd2, type = "loadings", loadings = 1:20, coeff = 0.4)
+
+plot(obj.svd2, type = "scores", scores = 1:20, coeff = 0.4)
+
+PCs <- matrix(NA, nrow(obj.bed), ncol(obj.svd2$u))
+PCs[ind.row, ] <- predict(obj.svd2)
+
+proj <- bed_projectSelfPCA(obj.svd2, obj.bed,
+ ind.row = rows_along(obj.bed)[-ind.row],
+ ncores = 1) # useless -> too few individuals
+PCs[-ind.row, ] <- proj$OADP_proj
+plot(PCs[ind.row, 7:8], pch = 20, xlab = "PC7", ylab = "PC8")
+points(PCs[-ind.row, 7:8], pch = 20, col = "blue")
+
+