Determining Cell Types with scater
Recommended Article : 【Bioinformatics】 Table of Contents for Bioinformatics Analysis
5. Dimensionality Reduction Techniques
a. Cell Type Classification Pipeline
b. Determining Cell Types with Seurat
c. Determining Cell Types with scanpy
1. Installing Packages
install.packages("BiocManager")
BiocManager::install("scRNAseq")
browseVignettes("scRNAseq")
# starting httpd help server ... done
BiocManager::install("scater")
browseVignettes("scater")
# starting httpd help server ... done
2. Setting up the Dataset
library(scRNAseq)
example_sce <- ZeiselBrainData()
example_sce
# class: SingleCellExperiment
# dim: 20006 3005
# metadata(0):
# assays(1): counts
# rownames(20006): Tspan12 Tshz1 ... mt-Rnr1 mt-Nd4l
# rowData names(1): featureType
# colnames(3005): 1772071015_C02 1772071017_G12 ... 1772066098_A12
# 1772058148_F03
# colData names(10): tissue group # ... level1class level2class
# reducedDimNames(0):
# spikeNames(0):
# altExpNames(2): ERCC repeat
str(counts(example_sce))
# int [1:20006, 1:3005] 0 3 3 0 1 0 0 11 1 0 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:20006] "Tspan12" "Tshz1" "Fnbp1l" "Adamts15" ...
# ..$ : chr [1:3005] "1772071015_C02" "1772071017_G12" "1772071017_A05" "1772071014_B06" ...
example_sce$whee <- sample(LETTERS, ncol(example_sce), replace=TRUE)
colData(example_sce)
# DataFrame with 3005 rows and 11 columns
# tissue group # total mRNA mol well sex
# <character> <numeric> <numeric> <numeric> <numeric>
# 1772071015_C02 sscortex 1 1221 3 3
# 1772071017_G12 sscortex 1 1231 95 1
# 1772071017_A05 sscortex 1 1652 27 1
# 1772071014_B06 sscortex 1 1696 37 3
# 1772067065_H06 sscortex 1 1219 43 3
# ... ... ... ... ... ...
# 1772067059_B04 ca1hippocampus 9 1997 19 1
# 1772066097_D04 ca1hippocampus 9 1415 21 1
# 1772063068_D01 sscortex 9 1876 34 3
# 1772066098_A12 ca1hippocampus 9 1546 88 1
# 1772058148_F03 sscortex 9 1970 15 3
# age diameter cell_id level1class
# <numeric> <numeric> <character> <character>
# 1772071015_C02 2 1 1772071015_C02 interneurons
# 1772071017_G12 1 353 1772071017_G12 interneurons
# 1772071017_A05 1 13 1772071017_A05 interneurons
# 1772071014_B06 2 19 1772071014_B06 interneurons
# 1772067065_H06 6 12 1772067065_H06 interneurons
# ... ... ... ... ...
# 1772067059_B04 4 382 1772067059_B04 endothelial-mural
# 1772066097_D04 7 12 1772066097_D04 endothelial-mural
# 1772063068_D01 7 268 1772063068_D01 endothelial-mural
# 1772066098_A12 7 324 1772066098_A12 endothelial-mural
# 1772058148_F03 7 6 1772058148_F03 endothelial-mural
# level2class whee
# <character> <character>
# 1772071015_C02 Int10 V
# 1772071017_G12 Int10 K
# 1772071017_A05 Int6 N
# 1772071014_B06 Int10 H
# 1772067065_H06 Int9 Z
# ... ... ...
# 1772067059_B04 Peric O
# 1772066097_D04 Vsmc K
# 1772063068_D01 Vsmc X
# 1772066098_A12 Vsmc H
# 1772058148_F03 Vsmc N
rowData(example_sce)$stuff <- runif(nrow(example_sce))
rowData(example_sce)
# DataFrame with 20006 rows and 2 columns
# featureType stuff
# <character> <numeric>
# Tspan12 endogenous 0.62180774891749
# Tshz1 endogenous 0.473351755877957
# Fnbp1l endogenous 0.710615682648495
# Adamts15 endogenous 0.189241249114275
# Cldn12 endogenous 0.211544606601819
# ... ... ...
# mt-Co2 mito 0.916890940628946
# mt-Co1 mito 0.747661913745105
# mt-Rnr2 mito 0.51556776673533
# mt-Rnr1 mito 0.917173949768767
# mt-Nd4l mito 0.74331742990762
3. Quality Control
⑴ Cell Level Quality Control
library(scater)
per.cell <- perCellQCMetrics(example_sce, subsets = list(Mito = grep("mt-", rownames(example_sce))))
summary(per.cell$sum)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 2574 8130 12913 14954 19284 63505
summary(per.cell$detected)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 785 2484 3656 3777 4929 8167
summary(per.cell$subsets_Mito_percent)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000 3.992 6.653 7.956 10.290 56.955
colData(example_sce) <- cbind(colData(example_sce), per.cell)
plotColData(example_sce, x = "sum", y="detected", colour_by="tissue")
plotColData(example_sce, x = "sum", y="subsets_Mito_percent", other_fields="tissue") + facet_wrap(~tissue)
keep.total <- example_sce$sum > 1e5
keep.n <- example_sce$detected > 500
filtered <- example_sce[,keep.total & keep.n]
keep.total <- isOutlier(per.cell$sum, type="lower", log=TRUE)
filtered <- example_sce[,keep.total]
qc.stats <- quickPerCellQC(per.cell, percent_subsets="subsets_Mito_percent")
colSums(as.matrix(qc.stats))
# low_lib_size low_n_features
# 0 3
# high_subsets_Mito_percent discard
# 128 131
filtered <- example_sce[,!qc.stats$discard]
⑵ Feature Level Quality Control
per.feat <- perFeatureQCMetrics(example_sce, subsets = list(Empty = 1:10))
summary(per.feat$mean)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0007 0.0097 0.1338 0.7475 0.5763 732.1524
summary(per.feat$detected)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.03328 0.76539 9.01830 18.87800 31.24792 99.96672
summary(per.feat$subsets_Empty_ratio)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000 0.000 0.601 1.872 2.016 300.500
ave <- calculateAverage(example_sce)
summary(ave)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.0002 0.0109 0.1443 0.7475 0.5674 850.6880
summary(nexprs(example_sce, byrow = TRUE))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1.0 23.0 271.0 567.3 939.0 3004.0
plotHighestExprs(example_sce, exprs_values = "counts")
keep_feature <- nexprs(example_sce, byrow=TRUE) > 0
example_sce <- example_sce[keep_feature,]
⑶ Variable Level Quality Control
example_sce <- logNormCounts(example_sce)
vars <- getVarianceExplained(example_sce, variables = c("tissue", "total mRNA mol", "sex", "age"))
head(vars)
# tissue total mRNA mol sex age
# Tspan12 0.02207262 0.074086504 0.146344996 0.09472155
# Tshz1 3.36083014 0.003846487 0.001079356 0.31262288
# Fnbp1l 0.43597185 0.421086301 0.003071630 0.64964174
# Adamts15 0.54233888 0.005348505 0.030821621 0.01393787
# Cldn12 0.03506751 0.309128294 0.008341408 0.02363737
# Rxfp1 0.18559637 0.016290703 0.055646799 0.02128006
plotExplanatoryVariables(vars)
4. Expression Calculation
example_sce <- logNormCounts(example_sce)
assayNames(example_sce)
# [1] "counts" "logcounts"
summary(librarySizeFactors(example_sce))
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.1721 0.5437 0.8635 1.0000 1.2895 4.2466
cpm(example_sce) <- calculateCPM(example_sce)
assay(example_sce, "normed") <- normalizeCounts(example_sce, size_factors=runif(ncol(example_sce)), pseudo_count=1.5)
plotExpression(example_sce, rownames(example_sce)[1:6], x = "level1class")
plotExpression(example_sce, rownames(example_sce)[1:6], x = rownames(example_sce)[10])
plotExpression(example_sce, rownames(example_sce)[1:6], x = "level1class", colour_by="tissue")
plotExpression(example_sce, rownames(example_sce)[1:6])
5. Dimensionality Reduction Techniques : Choose between PCA and tSNE
⑴ PCA (Principal Components Analysis)
example_sce <- runPCA(example_sce)
str(reducedDim(example_sce, "PCA"))
# num [1:3005, 1:50] 15.4 15 17.2 16.9 18.4 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:3005] "1772071015_C02" "1772071017_G12" "1772071017_A05" "1772071014_B06" ...
# ..$ : chr [1:50] "PC1" "PC2" "PC3" "PC4" ...
# - attr(*, "percentVar")= num [1:50] 39.72 9.38 4.25 3.9 2.76 ...
example_sce <- runPCA(example_sce, name="PCA2", subset_row=rownames(example_sce)[1:1000], ncomponents=25)
str(reducedDim(example_sce, "PCA2"))
# num [1:3005, 1:25] 20 21 23 23.7 21.5 ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:3005] "1772071015_C02" "1772071017_G12" "1772071017_A05" "1772071014_B06" ...
# ..$ : chr [1:25] "PC1" "PC2" "PC3" "PC4" ...
# - attr(*, "percentVar")= num [1:25] 22.3 5.11 3.42 1.69 1.58 ...
plotReducedDim(example_sce, dimred = "PCA", colour_by = "level1class")
Figure 1. PCA Results
⑵ tSNE
example_sce <- runUMAP(example_sce)
head(reducedDim(example_sce, "UMAP"))
# [,1] [,2]
# 1772071015_C02 -10.85709 -6.258108
# 1772071017_G12 -10.94470 -6.291569
# 1772071017_A05 -10.80176 -6.240728
# 1772071014_B06 -10.93094 -6.276943
# 1772067065_H06 -10.94264 -6.278843
# 1772071017_E02 -10.96068 -6.285830
plotTSNE(example_sce, colour_by = "Snap25")
Figure 2. tSNE Results
6. Customization : If you want to analyze your own data with scater
Input: 2019-12-20 13:44