output | permalink | title | header1 | header2 | image | home |
---|---|---|---|---|---|---|
html_document |
/scRNA_2023_module1 |
Single Cell RNA-seq Module 1 Lab Answers |
Workshop Pages for Students |
Prework Single-Cell RNA-seq 2023 |
/site_images/CBW_sc-rna-seq.jpg |
// Only run this code in IE 8 if (!!window.navigator.userAgent.match("MSIE 8")) { !function(a){"use strict";a.matchMedia=a.matchMedia||function(a){var b,c=a.documentElement,d=c.firstElementChild||c.firstChild,e=a.createElement("body"),f=a.createElement("div");return f.id="mq-test-1",f.style.cssText="position:absolute;top:-100em",e.style.background="none",e.appendChild(f),function(a){return f.innerHTML='<style media="'+a+'"> #mq-test-1 { width: 42px; }</style>',c.insertBefore(e,d),b=42===f.offsetWidth,c.removeChild(e),{matches:b,media:a}}}(a.document)}(this),function(a){"use strict";function b(){u(!0)}var c={};a.respond=c,c.update=function(){};var d=[],e=function(){var b=!1;try{b=new a.XMLHttpRequest}catch(c){b=new a.ActiveXObject("Microsoft.XMLHTTP")}return function(){return b}}(),f=function(a,b){var c=e();c&&(c.open("GET",a,!0),c.onreadystatechange=function(){4!==c.readyState||200!==c.status&&304!==c.status||b(c.responseText)},4!==c.readyState&&c.send(null))};if(c.ajax=f,c.queue=d,c.regex={media:/@media[^\{]+{([^\{\}]{[^\}\{]})+/gi,keyframes:/@(?:-(?:o|moz|webkit)-)?keyframes[^\{]+{(?:[^\{\}]{[^\}\{]})+[^\}]}/gi,urls:/(url()['"]?([^\/\)'"][^:\)'"]+)['"]?())/g,findStyles:/@media ([^\{]+){([\S\s]+?)$/,only:/(only\s+)?([a-zA-Z]+)\s?/,minw:/([\s]min-width\s:[\s]([\s][0-9.]+)(px|em)[\s])/,maxw:/([\s]max-width\s:[\s]([\s][0-9.]+)(px|em)[\s])/},c.mediaQueriesSupported=a.matchMedia&&null!==a.matchMedia("only all")&&a.matchMedia("only all").matches,!c.mediaQueriesSupported){var g,h,i,j=a.document,k=j.documentElement,l=[],m=[],n=[],o={},p=30,q=j.getElementsByTagName("head")[0]||k,r=j.getElementsByTagName("base")[0],s=q.getElementsByTagName("link"),t=function(){var a,b=j.createElement("div"),c=j.body,d=k.style.fontSize,e=c&&c.style.fontSize,f=!1;return b.style.cssText="position:absolute;font-size:1em;width:1em",c||(c=f=j.createElement("body"),c.style.background="none"),k.style.fontSize="100%",c.style.fontSize="100%",c.appendChild(b),f&&k.insertBefore(c,k.firstChild),a=b.offsetWidth,f?k.removeChild(c):c.removeChild(b),k.style.fontSize=d,e&&(c.style.fontSize=e),a=i=parseFloat(a)},u=function(b){var c="clientWidth",d=k[c],e="CSS1Compat"===j.compatMode&&d||j.body[c]||d,f={},o=s[s.length-1],r=(new Date).getTime();if(b&&g&&p>r-g)return a.clearTimeout(h),h=a.setTimeout(u,p),void 0;g=r;for(var v in l)if(l.hasOwnProperty(v)){var w=l[v],x=w.minw,y=w.maxw,z=null===x,A=null===y,B="em";x&&(x=parseFloat(x)(x.indexOf(B)>-1?i||t():1)),y&&(y=parseFloat(y)(y.indexOf(B)>-1?i||t():1)),w.hasquery&&(z&&A||!(z||e>=x)||!(A||y>=e))||(f[w.media]||(f[w.media]=[]),f[w.media].push(m[w.rules]))}for(var C in n)n.hasOwnProperty(C)&&n[C]&&n[C].parentNode===q&&q.removeChild(n[C]);n.length=0;for(var D in f)if(f.hasOwnProperty(D)){var E=j.createElement("style"),F=f[D].join("\n");E.type="text/css",E.media=D,q.insertBefore(E,o.nextSibling),E.styleSheet?E.styleSheet.cssText=F:E.appendChild(j.createTextNode(F)),n.push(E)}},v=function(a,b,d){var e=a.replace(c.regex.keyframes,"").match(c.regex.media),f=e&&e.length||0;b=b.substring(0,b.lastIndexOf("/"));var g=function(a){return a.replace(c.regex.urls,"$1"+b+"$2$3")},h=!f&&d;b.length&&(b+="/"),h&&(f=1);for(var i=0;f>i;i++){var j,k,n,o;h?(j=d,m.push(g(a))):(j=e[i].match(c.regex.findStyles)&&RegExp.$1,m.push(RegExp.$2&&g(RegExp.$2))),n=j.split(","),o=n.length;for(var p=0;o>p;p++)k=n[p],l.push({media:k.split("(")[0].match(c.regex.only)&&RegExp.$2||"all",rules:m.length-1,hasquery:k.indexOf("(")>-1,minw:k.match(c.regex.minw)&&parseFloat(RegExp.$1)+(RegExp.$2||""),maxw:k.match(c.regex.maxw)&&parseFloat(RegExp.$1)+(RegExp.$2||"")})}u()},w=function(){if(d.length){var b=d.shift();f(b.href,function(c){v(c,b.href,b.media),o[b.href]=!0,a.setTimeout(function(){w()},0)})}},x=function(){for(var b=0;b<s.length;b++){var c=s[b],e=c.href,f=c.media,g=c.rel&&"stylesheet"===c.rel.toLowerCase();e&&g&&!o[e]&&(c.styleSheet&&c.styleSheet.rawCssText?(v(c.styleSheet.rawCssText,e,f),o[e]=!0):(!/^([a-zA-Z:]*//)/.test(e)&&!r||e.replace(RegExp.$1,"").split("/")[0]===a.location.host)&&("//"===e.substring(0,2)&&(e=a.location.protocol+e),d.push({href:e,media:f})))}w()};x(),c.update=x,c.getEmValue=t,a.addEventListener?a.addEventListener("resize",b,!1):a.attachEvent&&a.attachEvent("onresize",b)}}(this); }; </script>
<style>h1 {font-size: 34px;} h1.title {font-size: 38px;} h2 {font-size: 30px;} h3 {font-size: 24px;} h4 {font-size: 18px;} h5 {font-size: 16px;} h6 {font-size: 12px;} code {color: inherit; background-color: rgba(0, 0, 0, 0.04);} pre:not([class]) { background-color: white }</style> <script> /** * jQuery Plugin: Sticky Tabs * * @author Aidan Lister * adapted by Ruben Arslan to activate parent tabs too * http://www.aidanlister.com/2014/03/persisting-the-tab-state-in-bootstrap/ */ (function($) { "use strict"; $.fn.rmarkdownStickyTabs = function() { var context = this; // Show the tab corresponding with the hash in the URL, or the first tab var showStuffFromHash = function() { var hash = window.location.hash; var selector = hash ? 'a[href="' + hash + '"]' : 'li.active > a'; var $selector = $(selector, context); if($selector.data('toggle') === "tab") { $selector.tab('show'); // walk up the ancestors of this element, show any hidden tabs $selector.parents('.section.tabset').each(function(i, elm) { var link = $('a[href="#' + $(elm).attr('id') + '"]'); if(link.data('toggle') === "tab") { link.tab("show"); } }); } }; // Set the correct tab when the page loads showStuffFromHash(context); // Set the correct tab when a user uses their back/forward button $(window).on('hashchange', function() { showStuffFromHash(context); }); // Change the URL when tabs are clicked $('a', context).on('click', function(e) { history.pushState(null, null, this.href); showStuffFromHash(context); }); return this; }; }(jQuery)); window.buildTabsets = function(tocID) { // build a tabset from a section div with the .tabset class function buildTabset(tabset) { // check for fade and pills options var fade = tabset.hasClass("tabset-fade"); var pills = tabset.hasClass("tabset-pills"); var navClass = pills ? "nav-pills" : "nav-tabs"; // determine the heading level of the tabset and tabs var match = tabset.attr('class').match(/level(\d) /); if (match === null) return; var tabsetLevel = Number(match[1]); var tabLevel = tabsetLevel + 1; // find all subheadings immediately below var tabs = tabset.find("div.section.level" + tabLevel); if (!tabs.length) return; // create tablist and tab-content elements var tabList = $('## Loading required package: Seurat
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
## (status 2 uses the sf package in place of rgdal)
## Attaching SeuratObject
## Loading required package: DropletUtils
## Loading required package: SingleCellExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
##
## Assays
## The following object is masked from 'package:Seurat':
##
## Assays
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
There are four basic data structures used in R: - vector : a 1D ordered set of 1 or more values that are all the same type - matrix : a 2D grid of values that are all the same type - data.frame : a 2D grid of values where each column can store data of a different type. - list : a 1D ordered set of 1 or more values where each value can be a different type.
x = c(1,-5,2,10,7) # vector of numbers
x[2]
## [1] -5
y = c("canadian", "bioinformatics", "workshop", "single", "cell") # vector of strings
y[1]
## [1] "canadian"
We can combine the vectors above into a matrix.
mat1 = cbind(x,x)
mat1
## x x
## [1,] 1 1
## [2,] -5 -5
## [3,] 2 2
## [4,] 10 10
## [5,] 7 7
However, if we try to mix data-types R will “coerce” them into the same type. In this case strings, as you can see from the quotation marks that are now around the numbers.
mat2 = rbind(x,y)
mat2
## [,1] [,2] [,3] [,4] [,5]
## x "1" "-5" "2" "10" "7"
## y "canadian" "bioinformatics" "workshop" "single" "cell"
To properly store numbers and text together we want a data.frame:
df = data.frame(num=x, text=y)
df
## num text
## 1 1 canadian
## 2 -5 bioinformatics
## 3 2 workshop
## 4 10 single
## 5 7 cell
If we want to store matrices, dataframes, and vectors together we can use a list:
lt = list(mat=mat2, vec=x, df=df)
lt
## $mat
## [,1] [,2] [,3] [,4] [,5]
## x "1" "-5" "2" "10" "7"
## y "canadian" "bioinformatics" "workshop" "single" "cell"
##
## $vec
## [1] 1 -5 2 10 7
##
## $df
## num text
## 1 1 canadian
## 2 -5 bioinformatics
## 3 2 workshop
## 4 10 single
## 5 7 cell
To access data within these different objects we use square brackets with the “index” i.e. the item number or row+column number for the data we want to access. If our data is named we can also use the name to access it.
# the "vec" in our list can be accessed in all of these ways
lt[[2]]
## [1] 1 -5 2 10 7
lt[["vec"]]
## [1] 1 -5 2 10 7
lt$vec
## [1] 1 -5 2 10 7
# the column "num" in our dataframe can be accessed in all of these ways
df$num
## [1] 1 -5 2 10 7
df[,"num"]
## [1] 1 -5 2 10 7
df[,colnames(df) == "num"]
## [1] 1 -5 2 10 7
R “Objects” Single-cell RNAseq data involves many different data structures for different parts of the data. Metadata is generally a dataframe. Gene expression is stored as a matrix. Lower dimensional projections are paired matrices, and cell-cell graphs require special datastructures.
We could combine these together using a list but a list is so flexible when we give a list to a function it can be hard to find the right data in it. e.g. is the normalized data called “norm”, “lognorm”, “normdata”, “expr_mat” etc… To solve this problem we use “Objects” that act like a list but with pre-defined slots for specific types of data.
You can tell when you are dealing with an object by the “@” symbol in the Global Environment or when using “str()” on the variable. This “@” indicated a pre-defined slot that must always exist in the object with a specific name and may to limited to storing a particular type of data. E.g. a “obj@counts” slot likely must contain either a matrix or a sparse matrix.
Single cell data & analysis uses several distinct types of information, each of which can be stored in different ways: - genes x cells gene expression (numbers), this can be raw counts or after one or two levels of normalization / scaling - metadata about cells (numbers, text), this often involves various categorical classifications of cells, e.g. processing batch, patient ID, disease status, cell-type, but can also involve various quality control information. - lower dimension representation (numbers), this includes a cells x dimensions matrix for each representation, and may include a genes x dimensions matrix of weights.
Different single-cell analysis packages use certain objects to organize these data, so functions can always find what they need.
Single cells isolated from the stria vascularis (SV) from the inner ear of lab mice (Korrapati et al. 2019). Technology: 10X Chromium Database: Gene Expression Omnibus GEO accession GSE136196. Preprocessing: cellranger(7.1.0)
We have data from a 10X experiment as a sparse matrix, where the data is stored as a triplet (row, column, value), so that only non-zero values are stored. Since our matrix 98% zero, this greatly reduces the amount of information we need to store!
Sparse matrices are defined in the Matrix
package and it
provides behind-the-scenes code that let us work with sparse matrices in
the exact same way as regular matrices.
Since one letter require the same amount of space to store as any
number, the text portion of the matrix (row and column names) are stored
once separately in features.tsv.gz
and
barcodes.tsv.gz
. While the triplets are stored in
matrix.mtx
as (row number, column number, umi count).
We could use Matrix::readMM()
to read in
matrix.mtx
and read.delim()
to read in the
text files and then put them together, but many packages provide a
function to automatically do this for us already.
To make a SingleCellExperiment
object automatically:
sce <- DropletUtils::read10xCounts("~/CourseData/scRNA_data/TAs_CourseData/Korrapati_MouseEar_Cells1/filtered_feature_bc_matrix/")
sce
## class: SingleCellExperiment
## dim: 36601 475
## metadata(1): Samples
## assays(1): counts
## rownames(36601): ENSG00000243485 ENSG00000237613 ... ENSG00000278817
## ENSG00000277196
## rowData names(3): ID Symbol Type
## colnames: NULL
## colData names(2): Sample Barcode
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
To make a SingleCellExperiment
object manually:
sparsematrix <- Matrix::readMM("~/CourseData/scRNA_data/TAs_CourseData/Korrapati_MouseEar_Cells1/filtered_feature_bc_matrix/matrix.mtx.gz")
my_rownames <- read.delim("~/CourseData/scRNA_data/TAs_CourseData/Korrapati_MouseEar_Cells1/filtered_feature_bc_matrix/features.tsv.gz", header=F)
my_colnames <- read.delim("~/CourseData/scRNA_data/TAs_CourseData/Korrapati_MouseEar_Cells1/filtered_feature_bc_matrix/barcodes.tsv.gz", header=F)
head(my_rownames)
## V1 V2 V3
## 1 ENSG00000243485 MIR1302-2HG Gene Expression
## 2 ENSG00000237613 FAM138A Gene Expression
## 3 ENSG00000186092 OR4F5 Gene Expression
## 4 ENSG00000238009 AL627309.1 Gene Expression
## 5 ENSG00000239945 AL627309.3 Gene Expression
## 6 ENSG00000239906 AL627309.2 Gene Expression
We need to pick which gene IDs we’ll use, we will stick with Ensembl IDs for now.
rownames(sparsematrix) <- my_rownames[,1]
colnames(sparsematrix) <- my_colnames[,1]
class(sparsematrix)
## [1] "dgTMatrix"
## attr(,"package")
## [1] "Matrix"
dim(sparsematrix)
## [1] 36601 475
sce2 <- SingleCellExperiment(
assays = list(counts = sparsematrix),
rowData = my_rownames
)
sce2
## class: SingleCellExperiment
## dim: 36601 475
## metadata(0):
## assays(1): counts
## rownames(36601): ENSG00000243485 ENSG00000237613 ... ENSG00000278817
## ENSG00000277196
## rowData names(3): V1 V2 V3
## colnames(475): AAACCTGAGCTAGTTC-1 AAACGGGCAGCTCCGA-1 ...
## TTTGCGCCATGTAGTC-1 TTTGTCAAGAGACTTA-1
## colData names(0):
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
Here we can see we have: - 36601 genes (rows) - 475 cells (columns) - 1 assay called “counts” - no lower dimensional representations - DropletUtils has automatically added information about each cell that is the “Sample” and “Barcode” which we don’t have when we made it manually.
Let’s check what DropletUtils did:
head(colData(sce), 10)
## DataFrame with 10 rows and 2 columns
## Sample Barcode
## <character> <character>
## 1 ~/CourseData/scRNA_d.. AAACCTGAGCTAGTTC-1
## 2 ~/CourseData/scRNA_d.. AAACGGGCAGCTCCGA-1
## 3 ~/CourseData/scRNA_d.. AAACGGGTCACCACCT-1
## 4 ~/CourseData/scRNA_d.. AAAGTAGCACATCCGG-1
## 5 ~/CourseData/scRNA_d.. AAAGTAGTCGGAAATA-1
## 6 ~/CourseData/scRNA_d.. AAATGCCCATTGGCGC-1
## 7 ~/CourseData/scRNA_d.. AACACGTGTGCACCAC-1
## 8 ~/CourseData/scRNA_d.. AACCATGGTCACCTAA-1
## 9 ~/CourseData/scRNA_d.. AACCATGTCCGAACGC-1
## 10 ~/CourseData/scRNA_d.. AACGTTGAGACCCACC-1
Here we retrieve the column data (cells) from the SCE using
colData()
then print the top 10 rows using
head()
. We can see that “Barcode” has stored the cell
barcode and “Sample” has stored the path to the folder we read in. This
is a bit messy to look at so let’s replace that with a shorter name for
this sample.
colData(sce)$Sample <- "Korrapati_exons"
head(colData(sce))
## DataFrame with 6 rows and 2 columns
## Sample Barcode
## <character> <character>
## 1 Korrapati_exons AAACCTGAGCTAGTTC-1
## 2 Korrapati_exons AAACGGGCAGCTCCGA-1
## 3 Korrapati_exons AAACGGGTCACCACCT-1
## 4 Korrapati_exons AAAGTAGCACATCCGG-1
## 5 Korrapati_exons AAAGTAGTCGGAAATA-1
## 6 Korrapati_exons AAATGCCCATTGGCGC-1
How many rows did head()
print by default?
If we look at the structure of our sce in the Environment browser or
using str()
we can see all the different slots in our SCE
object.
Exercise 1 - Working with SingleCellExperiment Objects Using the following “accessor” functions obtain the following information from the SCE object:
- the gene symbol of the 1013th row.
- the expression level for the 19108th gene in the 271th cell.
- the cellbarcodes of the first 25 cells.
Function | Description |
---|---|
rowData(sce) |
Table of gene metadata. |
colData(sce) |
Table of cell metadata. |
assay(sce, "counts") |
The assay named “counts”. |
reducedDim(sce, "PCA") |
The reduced dimensionality table named “PCA” |
sce$colname |
Shortcut to access the column “colname” from
colData . This is equivalent to
colData(sce)$colname |
sce[<rows>, <columns>] |
We can use the square brackets to subset the SCE object
by rows or columns, similarly to how we subset a matrix or
data.frame |
:::
Naming Assays
Assays can have any name we wish. However, there are some conventions we can follow:
counts
: Raw count data, e.g. number of reads or transcripts for a particular gene.normcounts
: Normalized values on the same scale as the original counts. For example, counts divided by cell-specific size factors that are centred at unity.logcounts
: Log-transformed counts or count-like values. In most cases, this will be defined as log-transformed normcounts, e.g. using log base 2 and a pseudo-count of 1.cpm
: Counts-per-million. This is the read count for each gene in each cell, divided by the library size of each cell in millions.tpm
: Transcripts-per-million. This is the number of transcripts for each gene in each cell, divided by the total number of transcripts in that cell (in millions).
Each of these has a function, so that we can access the “counts”
assay using the counts()
function. Therefore, these two are
equivalent:
counts(sce)
assay(sce, "counts")
While this is simply conventions, many packages we use will assume the assay names above and you may get errors if your assays are incorrectly named. Fortunately, many functions will automatically create the appropriately named assay.
The other commonly used software package for 10X Chromium data is
Seurat
. Like with SingleCellExperiment there are some
provided functions to make it easier to read data from cellranger.
sparsematrix <- Seurat::Read10X("~/CourseData/scRNA_data/TAs_CourseData/Korrapati_MouseEar_Cells1/filtered_feature_bc_matrix/")
seur_obj <- Seurat::CreateSeuratObject(sparsematrix)
seur_obj
## An object of class Seurat
## 36601 features across 475 samples within 1 assay
## Active assay: RNA (36601 features, 0 variable features)
Seurat has expanded its data object in order to store multi-modal data : e.g. RNA & ATACseq. So here we can see we have one assay called “RNA”, and we will have to look inside of “RNA” to find the count matrix and the normalized matrix (once we generate one).
Seurat::GetAssayData(seur_obj, assay="RNA", "counts")[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
## AAACCTGAGCTAGTTC-1 AAACGGGCAGCTCCGA-1 AAACGGGTCACCACCT-1
## MIR1302-2HG . . .
## FAM138A . . .
## OR4F5 . . .
## AL627309.1 . . .
## AL627309.3 . . .
## AAAGTAGCACATCCGG-1 AAAGTAGTCGGAAATA-1
## MIR1302-2HG . .
## FAM138A . .
## OR4F5 . .
## AL627309.1 . .
## AL627309.3 . .
seur_obj@assays$RNA@counts[1:5,1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
## AAACCTGAGCTAGTTC-1 AAACGGGCAGCTCCGA-1 AAACGGGTCACCACCT-1
## MIR1302-2HG . . .
## FAM138A . . .
## OR4F5 . . .
## AL627309.1 . . .
## AL627309.3 . . .
## AAAGTAGCACATCCGG-1 AAAGTAGTCGGAAATA-1
## MIR1302-2HG . .
## FAM138A . .
## OR4F5 . .
## AL627309.1 . .
## AL627309.3 . .
If we look at the structure of the Seurat Object as before, we can see that we have 3 slots for assays within our “RNA” slot: - “counts” : raw umi counts - “data” : normalized gene expression - “scale.data” : scaling each row of “data” to have a mean of 0 and standard deviation of 1
We also don’t have rowData
or colData
,
instead Seurat puts information about the cells into
meta.data
and doesn’t store gene information at all - it
uses gene symbols by default.
Seurat also automatically calculates some quality measures for us and puts them in the metadata. Let’s have a look:
head([email protected])
## orig.ident nCount_RNA nFeature_RNA
## AAACCTGAGCTAGTTC-1 SeuratProject 4596 1535
## AAACGGGCAGCTCCGA-1 SeuratProject 2191 952
## AAACGGGTCACCACCT-1 SeuratProject 1169 684
## AAAGTAGCACATCCGG-1 SeuratProject 3181 1117
## AAAGTAGTCGGAAATA-1 SeuratProject 1529 746
## AAATGCCCATTGGCGC-1 SeuratProject 832 411
Here we see the total UMI counts for each cell (“nCount_RNA”) and the
number of different genes detected per cell (“nFeature_RNA”). As with
DropletUtils, Seurat has automatically added a sample id
orig.ident
but this time it’s even less helpful.
Exercise 2 - Working with Seurat Object A) Fix the
orig.ident
column to contain a more useful name.
- Obtain/Calculate the following information:
- the UMI counts for the 501st gene in the 322nd cell.
- the mean detected genes per cell.
- the total UMIs detected in the whole dataset
Hints: sum()
calculates the sum of a vector.
mean()
calculates the mean of a vector.
GetAssayData()
accessed the expression matrices in a Seurat
Object.
Rather than fix the sample ID afterwards, we can set it when we create the Seurat Object:
seur_obj <- Seurat::CreateSeuratObject(sparsematrix, project="Korrapati_exons")
We’ve already seen that we can modify parts of the metadata. We can
use a similar approach to modify all parts of an object. However, each
slot has specific requirements that must be fulfilled to assign a new
value. You can use class()
to help figure out what is
required. These are to ensure the appropriate data is present in each
slot so that other functions that use the object can run correctly.
Exercise 3 Manually read in the
introns.mtx.gz
matrix. It contains counts for the number of
intronic reads only for our data. Replace the counts in our
seur_obj
with the intronic counts.
We could also create a new Assay object for our intronic counts and add it to the Seurat Object like so:
introns <- Seurat::CreateAssayObject(sparsematrix)
seur_obj@assays$Introns <- introns
To create your own analysis pipelines and explore your data it is helpful to have some quick ways to summarize data stored in matrices:
Function | Description |
---|---|
rowMeans(sparsematrix) |
mean of each row. |
colMeans(sparsematrix) |
mean of each column. |
rowSums(sparsematrix) |
sum of each row. |
colSums(sparsematrix) |
sum of each column. |
t() |
swaps rows and columns of a matrix. |
Let’s use these functions to calculate the same summary statistics we have in our SeuratObject for our SingleCellExperiment object.
colData(sce)$nCount <- colSums(counts(sce))
colData(sce)$nFeatures <- colSums(counts(sce) > 0)
When we do math with TRUE/FALSE values, the are swapped to numbers: TRUE = 1, FALSE = 0.
Similar to the standard data.frame
and
matrix
objects in R, we can use the [
operator
to subset our SingleCellExperiment or SeuratObject either by
rows (genes) or columns (cells). The general syntax
is:
sce[rows_of_interest, columns_of_interest]
.
We can use gene/column names, logical values, or row/column numbers.
For example, we might want to select only the cells with >1000 detected UMIs:
filtered_sce <- sce[,colData(sce)$nCount > 1000]
dim(filtered_sce)
## [1] 36601 217
Or we may want to remove mitochondrial genes from the matrix:
filtered_sce <- sce[!grepl("^MT-", rowData(sce)$Symbol),]
dim(filtered_sce)
## [1] 36588 475
grepl("^MT-")
uses a regular expression to find all the
genes that start with (“^”), a particular pattern (“MT-”). It returns
TRUE if the rownames matches the pattern and FALSE if it doesn’t. The
details of all the options for regular expressions is beyond the scope
of this course but use ?grepl
to see the related R
functions and see this website for more information on regular
expressions: Intro
to Regular Expressions
Exercise 4 - Subsetting objects 1. Subset our Seurat
object using the same criteria as the SCE. 2. Subset our SCE and Seurat
objects to contain only genes with total UMIs of at least 100 (hint:
look at the rowSums()
function) 3. Subset our Seurat object
to remove ribosomal genes (hint: their gene names start with “RPS” or
“RPL”)
KEY POINTS
The SingleCellExperiment (SCE) object is used to store expression data as well as information about our cells (columns) and genes (rows).
To create a new SCE object we can use the
SingleCellExperiment()
function. To read the output from cellranger we can use the dedicated functionDropletUtils::read10xCounts()
.The main parts of this object are:
- assay - one or more matrices of expression
quantification.
- There is one essential assay named “counts”, which contains the raw counts on which all other analyses are based on.
- rowData - information about our genes.
- colData - information about our cells.
- reducedDim - one or more reduced dimension representations of our data.
- assay - one or more matrices of expression
quantification.
We can add/modify parts of this object using the
The Seurat Object can store gene expression as well as other types of single-cell data such as scATAC or Multiome data. It stores count matrices as well as information about cells in a meta.data dataframe.
The main parts of this object are:
- assays - counts, log transformed + normalized counts (data), scaled data (scale.data)
- meta.data - information about the cells
- reductions - one or more reduced dimension representations of our data.
We can access parts of these object using ‘accessor’ functions or directly using
@
and$
notation. For exampleseur_obj@assays$RNA@counts
We can add/modify part of these objects using the assignment operator
<-
. For exampleassay(sce, "logcounts") <- log2(counts(sce) + 1)
would add a new assay named “logcounts” to our object.Matrix summary metrics are very useful to explore the properties of our expression data. Some of the more useful functions include
rowSums()
/colSums()
androwMeans()
/colMeans()
.Combining matrix summaries with conditional operators (
>
,<
,==
,!=
) can be used for conditional subsetting using[
.
View session info
## R version 4.3.0 (2023-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/Toronto
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] Matrix_1.6-0 DropletUtils_1.20.0
## [3] SingleCellExperiment_1.22.0 SummarizedExperiment_1.30.2
## [5] Biobase_2.60.0 GenomicRanges_1.52.0
## [7] GenomeInfoDb_1.36.1 IRanges_2.34.1
## [9] S4Vectors_0.38.1 BiocGenerics_0.46.0
## [11] MatrixGenerics_1.12.2 matrixStats_1.0.0
## [13] SeuratObject_4.1.3 Seurat_4.3.0.1
## [15] knitr_1.43
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.15.0
## [3] jsonlite_1.8.7 magrittr_2.0.3
## [5] spatstat.utils_3.0-3 rmarkdown_2.23
## [7] zlibbioc_1.46.0 vctrs_0.6.3
## [9] ROCR_1.0-11 DelayedMatrixStats_1.22.1
## [11] spatstat.explore_3.2-1 RCurl_1.98-1.12
## [13] S4Arrays_1.0.4 htmltools_0.5.5
## [15] Rhdf5lib_1.22.0 rhdf5_2.44.0
## [17] sass_0.4.7 sctransform_0.3.5
## [19] parallelly_1.36.0 KernSmooth_2.23-22
## [21] bslib_0.5.0 htmlwidgets_1.6.2
## [23] ica_1.0-3 plyr_1.8.8
## [25] plotly_4.10.2 zoo_1.8-12
## [27] cachem_1.0.8 igraph_1.5.0
## [29] mime_0.12 lifecycle_1.0.3
## [31] pkgconfig_2.0.3 R6_2.5.1
## [33] fastmap_1.1.1 GenomeInfoDbData_1.2.10
## [35] fitdistrplus_1.1-11 future_1.33.0
## [37] shiny_1.7.4.1 digest_0.6.33
## [39] colorspace_2.1-0 patchwork_1.1.2
## [41] tensor_1.5 dqrng_0.3.0
## [43] irlba_2.3.5.1 beachmat_2.16.0
## [45] progressr_0.13.0 fansi_1.0.4
## [47] spatstat.sparse_3.0-2 httr_1.4.6
## [49] polyclip_1.10-4 abind_1.4-5
## [51] compiler_4.3.0 BiocParallel_1.34.2
## [53] R.utils_2.12.2 HDF5Array_1.28.1
## [55] MASS_7.3-60 DelayedArray_0.26.6
## [57] tools_4.3.0 lmtest_0.9-40
## [59] httpuv_1.6.11 future.apply_1.11.0
## [61] goftest_1.2-3 R.oo_1.25.0
## [63] glue_1.6.2 rhdf5filters_1.12.1
## [65] nlme_3.1-162 promises_1.2.0.1
## [67] grid_4.3.0 Rtsne_0.16
## [69] cluster_2.1.4 reshape2_1.4.4
## [71] generics_0.1.3 gtable_0.3.3
## [73] spatstat.data_3.0-1 R.methodsS3_1.8.2
## [75] tidyr_1.3.0 data.table_1.14.8
## [77] sp_2.0-0 utf8_1.2.3
## [79] XVector_0.40.0 spatstat.geom_3.2-2
## [81] RcppAnnoy_0.0.21 ggrepel_0.9.3
## [83] RANN_2.6.1 pillar_1.9.0
## [85] stringr_1.5.0 limma_3.56.2
## [87] later_1.3.1 splines_4.3.0
## [89] dplyr_1.1.2 lattice_0.21-8
## [91] survival_3.5-5 deldir_1.0-9
## [93] tidyselect_1.2.0 locfit_1.5-9.8
## [95] scuttle_1.10.1 miniUI_0.1.1.1
## [97] pbapply_1.7-2 gridExtra_2.3
## [99] edgeR_3.42.4 scattermore_1.2
## [101] xfun_0.39 stringi_1.7.12
## [103] lazyeval_0.2.2 yaml_2.3.7
## [105] evaluate_0.21 codetools_0.2-19
## [107] tibble_3.2.1 cli_3.6.1
## [109] uwot_0.1.16 xtable_1.8-4
## [111] reticulate_1.30 munsell_0.5.0
## [113] jquerylib_0.1.4 Rcpp_1.0.11
## [115] globals_0.16.2 spatstat.random_3.1-5
## [117] png_0.1-8 parallel_4.3.0
## [119] ellipsis_0.3.2 ggplot2_3.4.2
## [121] sparseMatrixStats_1.12.2 bitops_1.0-7
## [123] listenv_0.9.0 viridisLite_0.4.2
## [125] scales_1.2.1 ggridges_0.5.4
## [127] crayon_1.5.2 leiden_0.4.3
## [129] purrr_1.0.1 rlang_1.1.1
## [131] cowplot_1.1.1