Obtaining DEGs (Differentially Expressed Genes) in bulk RNA-seq (DESeq2, t test, ANOVA)
Recommended Post : 【Bioinformatics】 Table of Contents for Bioinformatics Analysis
1. RStudio Functions
2. Example
a. Transcriptome Analysis Pipeline
1. RStudio Functions
⑴ Obtaining DEGs based on DESeq2 for 2 groups
a = 'DSS_ST_late'
b = 'DSS_Saline_late'
Idents(pbmc_gene) <- pbmc_gene$orig.ident
pbmc_gene_ = pbmc_gene[, pbmc_gene$orig.ident %in% c(a, b)]
counts <- pbmc_gene_@assays$RNA@counts
sample_info <- data.frame(
row.names = colnames(counts),
condition = pbmc_gene_$orig.ident[colnames(counts)]
)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = sample_info,
design = ~ condition
)
dds <- DESeq(dds)
res <- results(dds, contrast=c("condition", a, b))
res_ordered <- res[order(res$log2FoldChange), ]
res_significant <- subset(res_ordered, padj < 0.05)
res_significant
⑵ Obtaining DEGs based on DESeq2 for multiple groups
library(compGenomRData)
library(DESeq2)
library(stats)
counts_file <- system.file ("extdata/rna-seq/SRP029880.raw_counts.tsv", package = "compGenomRData")
coldata_file <- system.file ("extdata/rna-seq/SRP029880.colData.tsv", package = "compGenomRData")
counts <- as.matrix(read.table(counts_file, header = T, sep = '\t'))
countData <- as.matrix(subset(counts, select = c(-width)))
colData <- read.table(coldata_file, header = T, sep = '\t',
stringsAsFactors = TRUE)
designFormula <- "~ group"
dds <- DESeqDataSetFromMatrix(countData = countData,
colData = colData,
design = as.formula(designFormula))
dds <- dds[ rowSums(DESeq2::counts(dds)) > 1, ]
dds <- DESeq(dds)
DEresults = results(dds, contrast = c("group", 'CASE', 'CTRL'))
DEresults <- DEresults[order(DEresults$pvalue),]
⑶ t test for two groups
t.test(v1, v2, paired = FALSE)
# maybe you can activate 'paired' in a special condition
⑷ ANOVA test for two groups
one_way_2_factor_anova <- function(v1, v2){
dat <- matrix(0, nrow = ( length(v1) + length(v2) ), ncol = 2 )
for(i in 1 : length(v1) ){
dat[i, 1] <- v1[i]
dat[i, 2] <- 'v1'
}
for(i in 1 : length(v2) ){
dat[i + length(v1), 1] <- v2[i]
dat[i + length(v1), 2] <- 'v2'
}
dat <- as.data.frame(dat)
colnames(dat) <- c('val', 'factor')
anova_IS <- aov(val ~ factor, data = dat)
print(summary(anova_IS))
anova_residuals <- anova_IS$residuals
print(summary(anova_residuals))
}
⑸ ANOVA test for three groups
one_way_3_factor_anova <- function(v1, v2, v3){
dat <- matrix(0, nrow = ( length(v1) + length(v2) + length(v3) ), ncol = 2 )
for(i in 1 : length(v1) ){
dat[i, 1] <- v1[i]
dat[i, 2] <- 'v1'
}
for(i in 1 : length(v2) ){
dat[i + length(v1), 1] <- v2[i]
dat[i + length(v1), 2] <- 'v2'
}
for(i in 1 : length(v3) ){
dat[i + length(v1) + length(v2), 1] <- v3[i]
dat[i + length(v1) + length(v2), 2] <- 'v3'
}
dat <- as.data.frame(dat)
colnames(dat) <- c('val', 'factor')
anova_IS <- aov(val ~ factor, data = dat)
print(summary(anova_IS))
anova_residuals <- anova_IS$residuals
print(summary(anova_residuals))
}
⑹ ANOVA for four groups
one_way_4_factor_anova <- function(v1, v2, v3, v4){
dat <- matrix(0, nrow = ( length(v1) + length(v2) + length(v3) + length(v4) ), ncol = 2 )
for(i in 1 : length(v1) ){
dat[i, 1] <- v1[i]
dat[i, 2] <- 'v1'
}
for(i in 1 : length(v2) ){
dat[i + length(v1), 1] <- v2[i]
dat[i + length(v1), 2] <- 'v2'
}
for(i in 1 : length(v3) ){
dat[i + length(v1) + length(v2), 1] <- v3[i]
dat[i + length(v1) + length(v2), 2] <- 'v3'
}
for(i in 1 : length(v4) ){
dat[i + length(v1) + length(v2) + length(v3), 1] <- v4[i]
dat[i + length(v1) + length(v2) + length(v3), 2] <- 'v4'
}
dat <- as.data.frame(dat)
colnames(dat) <- c('val', 'factor')
anova_IS <- aov(val ~ factor, data = dat)
print(summary(anova_IS))
anova_residuals <- anova_IS$residuals
print(summary(anova_residuals))
}
2. Example
# http://compgenomr.github.io/book/gene-expression-analysis-using-high-throughput-sequencing-technologies.html#quantification
### step 1. 데이터 로딩 ###
library(dplyr)
# install.packages("Seurat")
library(Seurat)
# 출처 : https://github.com/compgenomr/compGenomRData
devtools::install_github("compgenomr/compGenomRData")
library(compGenomRData)
counts_file <- system.file ("extdata/rna-seq/SRP029880.raw_counts.tsv", package = "compGenomRData")
coldata_file <- system.file ("extdata/rna-seq/SRP029880.colData.tsv", package = "compGenomRData")
counts <- as.matrix(read.table(counts_file, header = T, sep = '\t'))
### step 2. TPM 정규화 ###
library(pheatmap)
library(stats)
library(ggplot2)
cpm <- apply(subset(counts, select = c(-width)), 2,
function(x) x/sum(as.numeric(x)) * 10^6)
geneLengths <- as.vector(subset(counts, select = c(width)))
rpkm <- apply(X = subset(counts, select = c(-width)),
MARGIN = 2,
FUN = function(x) {
10^9 * x / geneLengths / sum(as.numeric(x))
})
rpk <- apply( subset(counts, select = c(-width)), 2,
function(x) x/(geneLengths/1000))
tpm <- apply(rpk, 2, function(x) x / sum(as.numeric(x)) * 10^6)
V <- apply(tpm, 1, var)
selectedGenes <- names(V[order(V, decreasing = T)][1:100])
colData <- read.table(coldata_file, header = T, sep = '\t',
stringsAsFactors = TRUE)
M <- t(tpm[selectedGenes,])
M <- log2(M + 1)
pcaResults <- prcomp(M)
correlationMatrix <- cor(tpm)
### step 3. DESeq2 DEG 획득 ###
library(DESeq2)
library(stats)
countData <- as.matrix(subset(counts, select = c(-width)))
colData <- read.table(coldata_file, header = T, sep = '\t',
stringsAsFactors = TRUE)
designFormula <- "~ group"
dds <- DESeqDataSetFromMatrix(countData = countData,
colData = colData,
design = as.formula(designFormula))
dds <- dds[ rowSums(DESeq2::counts(dds)) > 1, ]
dds <- DESeq(dds)
DEresults = results(dds, contrast = c("group", 'CASE', 'CTRL'))
DEresults <- DEresults[order(DEresults$pvalue),]
write.csv(DEresults, "~/Downloads/DEresults.csv")
DEresults_trim <- 0
flag = 0
for(i in 1: dim(DEresults)[1]){
a = sum(DEresults[1:i, ]$padj < 0.05)
b = sum(DEresults[1:i, ]$pvalue < 0.05)
if(flag == 0 && a / b < 0.99){
flag = 1
DEresults_trim <- DEresults[1:i-1, ]
write.csv(DEresults[1:i-1, ], "~/Downloads/DEresults(FDR = 1%).csv")
}
}
### step 4. t.test DEG 획득 ###
pbmc <- CreateSeuratObject(counts = tpm, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc <- FindVariableFeatures(object = pbmc, selection.method = "vst", nfeatures = 2000)
all.genes <- rownames(x = pbmc)
markers_t.test <- FindMarkers(object = pbmc, test.use ='t', only.pos = TRUE, min.pct = 0.1, logfc.threshold = 0.25, group.by = 'orig.ident', ident.1 = 'CASE')
write.csv(markers_t.test, "~/Downloads/t.test.csv")
markers_t.test_trim <- 0
flag = 0
for(i in 1: dim(markers_t.test)[1]){
a = sum(markers_t.test[1:i, ]$p_val_adj < 0.05)
b = sum(markers_t.test[1:i, ]$p_val < 0.05)
if(flag == 0 && a / b < 0.99){
flag = 1
markers_t.test_trim <- markers_t.test[1:i-1, ]
write.csv(markers_t.test[1:i-1, ], "~/Downloads/t.test(FDR = 1%).csv")
}
}
### step 5. 유전자 교집합 ###
inter <- intersect(rownames(DEresults_trim), rownames(markers_t.test_trim))
write.csv(inter, "~/Downloads/inter.csv")
⑴ Results
DESeq2 identified 7,050 DEGs, and t-test identified 9 DEGs. There are 9 overlapping genes between them.
⑵ Interpretation
The reason for the differences is primarily due to different assumptions. DESeq2 assumes a negative binomial distribution as the underlying distribution, while t-test assumes the student’s T test. The negative binomial distribution is known to be a more suitable assumption for RNA-seq data, and although the distributions become similar when the sample size is large, for small sample sizes, the inaccuracies of the t-test are more pronounced. Fortunately, since all t-test results were consistent with DESeq2, it can be said that robustness is observed regardless of the type of statistical analysis. However, the reason for the lower number of DEGs observed in the t-test is attributed to such inaccuracies.
Input : 2022.06.08 13:50