A collection of main functions useful in R

Higher category: 【RStudio】 R Studio Index, 【Bioinformatics】 Bioinformatics Analysis Index

1. Overview

2. Basics

3. File I/O

4. Vectors and Matrices

5. gsub

6. Statistical Analysis

7. Bioinformatics

8. Plot Generation

a. GitHub (executable .R file)

b. Useful Functions in Python

1. Overview

⑴ The functions defined below can be called as follows.

source("https://github.com/JB243/nate9389/blob/main/RStudio/A_Collection_of_Useful_Functions_in_RStudio.R?raw=true")

① However, the following problems may occur due to the above code.

② Problem 1. Error in sum (dim(scRNaseqData):argument “b” is missing, with no default

③ Problem 2. “do.call” r argument “b” is missing, with no default

⑵ .R file can be used as follows:

Rscript /numbat/inst/bin/pileup_and_phase.R \
    --label possorted_genome_bam \
    --samples possorted_genome_bam \
    --bams possorted_genome_bam.bam \
    --barcodes possorted_genome_bam_barcodes.tsv \
    --outdir possorted_genome_bam \
    --gmap /Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
    --snpvcf /data/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
    --paneldir /data/1000G_hg38 \
    --ncores 1

⑶ The functions defined below can be used as follows:

SUM <- function(a, b){
    return (a + b)
}

A <- 1
B <- 2
print(SUM(A, B))
# 3

2. Basics

⑴ Check data type

# x is a variable
typeof(x)

⑵ A function that fills in the number of zeros before a natural number to give a total of five digits.

n = 55
sprintf("%05d", n)

⑶ A function waiting for enter to hit. (reference)

readline(prompt="Press [enter] to continue")

⑷ Multiple processing

# Reference : https://cran.r-project.org/web/packages/future/vignettes/future-3-topologies.html

plan("multisession", workers = 10)

⑸ %in% operator

x <- c('A','B','C','D','E','F','G')
x %in% c('F','G')
# FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE

3. File I/O

⑴ Check whether destination folder exists

if (!file.exists(destination.folder)) {
    stop(.wrap("The destination folder could not be found. Please change",
               "the path specified in", sQuote(destination.folder)))
}

⑵ Check for write permissions in the destination folder

if (file.access(destination.folder, 2) == -1) {
    stop(.wrap("You do not have write permission in the destination",
               "folder."))

⑶ Create a folder of ‘Filtered_bams’ under ‘destination.folder’

destination.folder <- file.path(destination.folder, "Filtered_bams")
tryCatch({
    if (!file.exists(file.path(destination.folder))) {
        dir.create(file.path(destination.folder),
                   recursive = TRUE)
    } else {
        stop(.wrap("The folder",
                   sQuote(file.path(destination.folder, "Filtered_bams")),
                   "already exists. Please remove it, or (in case you",
                   "still need it), rename it to prevent files from being",
                   "overwritten."))
    }
}, warning = function(e) {
    stop(.wrap("You do not have write permissions in the destination",
               "folder. Stopping execution of the remaining part of the",
               "script..."))
})

⑷ Code to read a .gz file and decompress it

tmp = readMM('matrix.mtx.gz')
writeMM(tmp, 'matrix.mtx')

4. Vectors and Matrices

⑴ Sort the given vectors in the order of a, b, and c.

my_vec <- c('c', 'b', 'a')

print( my_vec[order(my_vec)] )
# "a" "b" "c"

⑵ Whether a given string (given_str) contains a particular string (partial_str): grepl (reference)

grepl(partial_str, given_str, fixed = TRUE)

⑶ An index output containing a particular string “str” from a given string vector.

has_string_arr <- function(str, arr){
    flag <- 0
    ar <- array()
    for(i in 1:length(arr)){
        if( grepl(str, arr[i], fixed=TRUE) ){
            flag <- flag + 1
            ar[flag] <- i
        }
    }
    return(ar)
}

⑷ Intersection of three vectors

tri_intersect <- function(A, B, C){
    a <- intersect(A, B)
    b <- intersect(a, C)
    return(b)
}

⑸ A function that converts the NA portion of a given vector into zero

trim_na <- function(vector){
  for(i in 1: length(vector)){
    if(is.na(vector[i])){
      vector[i] = 0
    }
  }
  return (vector)
}

⑹ A function that excludes the NA portion of a given vector

ignore_na <- function(vector){
  flag = 0
  ar = array()
  for(i in 1: length(vector)){
    if(! is.na(vector[i])){
      flag = flag + 1
      ar[flag] = vector[i]
    }
  }
  return (ar)
}

⑺ A function that replaces a particular element with another element in a given vector

replace_in_vector <- function(v, from_element, to_element){
  ar <- array(dim = length(v))

  for(i in 1 : length(v)){
    if(v[i] == from_element){
      ar[i] = to_element
    } else{
      ar[i] = v[i]
    }
  }

  return(ar)
}

⑻ A function that returns the left corner of a matrix or data frame

corner <- function(x, num = 10){
  return(x[1:min(  num, dim(x)[1]  ), 
           1:min(  num, dim(x)[2]  )])
}

⑼ A function that outputs cbind when given any two matrices

my.cbind <- function(data1, data2){
  a <- intersect(rownames(data1), rownames(data2))
  return (cbind(data1[a, ], data2[a, ]))
}

⑽ A function that outputs rbind when given any two matrices

my.rbind <- function(data1, data2){
  a <- intersect(colnames(data1), colnames(data2))
  return (rbind(data1[, a], data2[, a]))
}

⑾ A function that outputs lines A and B in a given matrix or data frame

switch_A_B_row <- function(mat0, A, B){
  mat <- mat0
  mat[A, ] = mat0[B, ]
  mat[B, ] = mat0[A, ] 
  rownames(mat)[A] = rownames(mat0)[B]
  rownames(mat)[B] = rownames(mat0)[A]
  return(mat)
}

⑿ A function that switches columns A and B in a given matrix or data frame

switch_A_B_col <- function(mat0, A, B){
  mat <- mat0
  mat[, A] = mat0[, B]
  mat[, B] = mat0[, A] 
  colnames(mat)[A] = colnames(mat0)[B]
  colnames(mat)[B] = colnames(mat0)[A]
  return(mat)
}

5. gsub

⑴ A function that removes the hyphen (-) from a given string

eliminate_hyphene <- function(str){
    return(gsub("-", "", str))
}

⑵ A function of removing special symbols such as “(“) from a given string (reference)

eliminate_symbol <- function(str){
    return(gsub("(", "", str, fixed = TRUE))
}

⑶ A function that removes all characters after x and x from a given string (reference)

eliminate_backward <- function(str){
    return(gsub('x.*$', '', str))
}

⑷ A function that removes all numbers following . and . from a given string, for “ABCD.123”

eliminate_backward <- function(str){
    return(gsub('*.[0-9]*$', '', str))
}

⑸ A function that removes all characters before x and x from a given string

eliminate_forward <- function(str){
    return(gsub('.*x', '', str))
}

⑹ A function that fills the characters from start to end with “x” in a given string-type variable (‘given_str’) and returns them

str_substitute <- function(given_str, start, end){
    library(stringr)
    library(stringi)
  
    result = paste0(
        str_sub(given_str, 1, start-1),    
        stri_dup("x", (end-start+1) ),
        str_sub(given_str, end+1, str_length(given_str))
    )

    return(result)
}

⑺ Code for collecting numbers following a particular pattern within a given string (‘given_str’)

number_after_pattern_in_str <- function(given_str, pattern){
    # reference : https://cran.r-project.org/web/packages/stringr/vignettes/stringr.html

    library(stringr)

    phone1 <- paste0(pattern, '([1-9]{1})')
    phone2 <- paste0(pattern, '([1-9][0-9]{1})')
    phone3 <- paste0(pattern, '([1-9][0-9]{2})')
    phone4 <- paste0(pattern, '([1-9][0-9]{3})')

    arr <- array()
    flag = 0

    for(i in 1 : dim(str_locate_all(given_str, phone4)[[1]])[1] ){
        if (dim(str_locate_all(given_str, phone4)[[1]])[1] > 0){
            flag <- flag + 1
            arr[flag] = str_sub(
                given_str, 
                str_locate_all(given_str, phone4)[[1]][1, 1], #start
                str_locate_all(given_str, phone4)[[1]][1, 2]  #end
            )
            given_str = str_substitute(
                given_str, 
                str_locate_all(given_str, phone4)[[1]][1, 1], #start
                str_locate_all(given_str, phone4)[[1]][1, 2]  #end
            )
        }
    }

    for(i in 1 : dim(str_locate_all(given_str, phone3)[[1]])[1] ){
        if (dim(str_locate_all(given_str, phone3)[[1]])[1] > 0){
            flag <- flag + 1
            arr[flag] = str_sub(
                given_str, 
                str_locate_all(given_str, phone3)[[1]][1, 1], #start
                str_locate_all(given_str, phone3)[[1]][1, 2]  #end
            )
            given_str = str_substitute(
                given_str, 
                str_locate_all(given_str, phone3)[[1]][1, 1], #start
                str_locate_all(given_str, phone3)[[1]][1, 2]  #end
            )
        }
    }

    for(i in 1 : dim(str_locate_all(given_str, phone2)[[1]])[1] ){
        if (dim(str_locate_all(given_str, phone2)[[1]])[1] > 0){
            flag <- flag + 1
            arr[flag] = str_sub(
                given_str, 
                str_locate_all(given_str, phone2)[[1]][1, 1], #start
                str_locate_all(given_str, phone2)[[1]][1, 2]  #end
            )
            given_str = str_substitute(
                given_str, 
                str_locate_all(given_str, phone2)[[1]][1, 1], #start
                str_locate_all(given_str, phone2)[[1]][1, 2]  #end
            )
        }
    }

    for(i in 1 : dim(str_locate_all(given_str, phone1)[[1]])[1] ){
        if (dim(str_locate_all(given_str, phone1)[[1]])[1] > 0){
            flag <- flag + 1
            arr[flag] = str_sub(
                given_str, 
                str_locate_all(given_str, phone1)[[1]][1, 1], #start
                str_locate_all(given_str, phone1)[[1]][1, 2]  #end
            )
            given_str = str_substitute(
                given_str, 
                str_locate_all(given_str, phone1)[[1]][1, 1], #start
                str_locate_all(given_str, phone1)[[1]][1, 2]  #end
            )
        }
    }	

    arr <- str_sub(arr, str_length(pattern)+1, str_length(arr))
    arr <- as.numeric(arr)
    return(arr)
}

6. Statistical Analysis

⑴ Code for percentile quantiles from 10% to 90%

# reference: https://www.statology.org/percentiles-in-r/

quantile(data, probs = seq(.1, .9, by = .1))

⑵ A value obtained by taking log10 for n!

log10_factorial <- function(n){
  if(n == 0){
    return(0)
  }

  out <- 0
  for(i in 1 : n){
    out <- out + log(i) / log(10)
  }
  return(out)
}

⑶ A function to obtain the binomial coefficient _nC_k, which is the number of cases where k out of n is selected: Improves code because simply using factorial can create inf

my.combination <- function(n, k){
  # return nCk = n! / ((n-k)! k!)
  
  if (n == k || n == 0 || k == 0){
    return(1)
  }

  A = log10_factorial(n)
  B = log10_factorial(n-k)
  C = log10_factorial(k)
  
  log10_nCk = A - B - C
  return(10^(log10_nCk))
}

⑷ t test between two groups

t.test(v1, v2, paired = FALSE)
# maybe you can activate 'paired' in a special condition

⑸ ANOVA test between two groups

one_way_2_factor_anova <- function(v1, v2){
  dat <- matrix(0, nrow = ( length(v1) + length(v2) ), ncol = 2 )
  for(i in 1 : length(v1) ){
    dat[i, 1] <- v1[i]
    dat[i, 2] <- 'v1'
  }
  for(i in 1 : length(v2) ){
    dat[i + length(v1), 1] <- v2[i]
    dat[i + length(v1), 2] <- 'v2'
  }
  dat <- as.data.frame(dat)
  
  colnames(dat) <- c('val', 'factor')

  anova_IS <- aov(val ~ factor, data = dat)
  print(summary(anova_IS))

  anova_residuals <- anova_IS$residuals
  print(summary(anova_residuals))
}

⑹ ANOVA test among three groups

one_way_3_factor_anova <- function(v1, v2, v3){
  dat <- matrix(0, nrow = ( length(v1) + length(v2) + length(v3) ), ncol = 2 )
  for(i in 1 : length(v1) ){
    dat[i, 1] <- v1[i]
    dat[i, 2] <- 'v1'
  }
  for(i in 1 : length(v2) ){
    dat[i + length(v1), 1] <- v2[i]
    dat[i + length(v1), 2] <- 'v2'
  }
  for(i in 1 : length(v3) ){
    dat[i + length(v1) + length(v2), 1] <- v3[i]
    dat[i + length(v1) + length(v2), 2] <- 'v3'
  }
  dat <- as.data.frame(dat)
  
  colnames(dat) <- c('val', 'factor')

  anova_IS <- aov(val ~ factor, data = dat)
  print(summary(anova_IS))

  anova_residuals <- anova_IS$residuals
  print(summary(anova_residuals))
}

⑺ ANOVA test among four groups

one_way_4_factor_anova <- function(v1, v2, v3, v4){
  dat <- matrix(0, nrow = ( length(v1) + length(v2) + length(v3) + length(v4) ), ncol = 2 )
  for(i in 1 : length(v1) ){
    dat[i, 1] <- v1[i]
    dat[i, 2] <- 'v1'
  }
  for(i in 1 : length(v2) ){
    dat[i + length(v1), 1] <- v2[i]
    dat[i + length(v1), 2] <- 'v2'
  }
  for(i in 1 : length(v3) ){
    dat[i + length(v1) + length(v2), 1] <- v3[i]
    dat[i + length(v1) + length(v2), 2] <- 'v3'
  }
  for(i in 1 : length(v4) ){
    dat[i + length(v1) + length(v2) + length(v3), 1] <- v4[i]
    dat[i + length(v1) + length(v2) + length(v3), 2] <- 'v4'
  }
  dat <- as.data.frame(dat)
  
  colnames(dat) <- c('val', 'factor')

  anova_IS <- aov(val ~ factor, data = dat)
  print(summary(anova_IS))

  anova_residuals <- anova_IS$residuals
  print(summary(anova_residuals))
}

⑻ ANOVA test among five groups

one_way_5_factor_anova <- function(v1, v2, v3, v4, v5){
  dat <- matrix(0, nrow = ( length(v1) + length(v2) + length(v3) + length(v4) + length(v5) ), ncol = 2 )
  for(i in 1 : length(v1) ){
    dat[i, 1] <- v1[i]
    dat[i, 2] <- 'v1'
  }
  for(i in 1 : length(v2) ){
    dat[i + length(v1), 1] <- v2[i]
    dat[i + length(v1), 2] <- 'v2'
  }
  for(i in 1 : length(v3) ){
    dat[i + length(v1) + length(v2), 1] <- v3[i]
    dat[i + length(v1) + length(v2), 2] <- 'v3'
  }
  for(i in 1 : length(v4) ){
    dat[i + length(v1) + length(v2) + length(v3), 1] <- v4[i]
    dat[i + length(v1) + length(v2) + length(v3), 2] <- 'v4'
  }
  for(i in 1 : length(v5) ){
    dat[i + length(v1) + length(v2) + length(v3) + length(v4), 1] <- v5[i]
    dat[i + length(v1) + length(v2) + length(v3) + length(v4), 2] <- 'v5'
  }
  dat <- as.data.frame(dat)
  
  colnames(dat) <- c('val', 'factor')

  anova_IS <- aov(val ~ factor, data = dat)
  print(summary(anova_IS))

  anova_residuals <- anova_IS$residuals
  print(summary(anova_residuals))
}

⑼ ANOVA test among six groups

one_way_6_factor_anova <- function(v1, v2, v3, v4, v5, v6){
  dat <- matrix(0, nrow = ( length(v1) + length(v2) + length(v3) + length(v4) + length(v5) + length(v6) ), ncol = 2 )
  for(i in 1 : length(v1) ){
    dat[i, 1] <- v1[i]
    dat[i, 2] <- 'v1'
  }
  for(i in 1 : length(v2) ){
    dat[i + length(v1), 1] <- v2[i]
    dat[i + length(v1), 2] <- 'v2'
  }
  for(i in 1 : length(v3) ){
    dat[i + length(v1) + length(v2), 1] <- v3[i]
    dat[i + length(v1) + length(v2), 2] <- 'v3'
  }
  for(i in 1 : length(v4) ){
    dat[i + length(v1) + length(v2) + length(v3), 1] <- v4[i]
    dat[i + length(v1) + length(v2) + length(v3), 2] <- 'v4'
  }
  for(i in 1 : length(v5) ){
    dat[i + length(v1) + length(v2) + length(v3) + length(v4), 1] <- v5[i]
    dat[i + length(v1) + length(v2) + length(v3) + length(v4), 2] <- 'v5'
  }
  for(i in 1 : length(v6) ){
    dat[i + length(v1) + length(v2) + length(v3) + length(v4) + length(v5), 1] <- v6[i]
    dat[i + length(v1) + length(v2) + length(v3) + length(v4) + length(v5), 2] <- 'v6'
  }
  dat <- as.data.frame(dat)
  
  colnames(dat) <- c('val', 'factor')

  anova_IS <- aov(val ~ factor, data = dat)
  print(summary(anova_IS))

  anova_residuals <- anova_IS$residuals
  print(summary(anova_residuals))
}

⑽ Investigate FC, p value of two vectors

comparison_of_two_vectors <- function(v1, v2, paired = FALSE){
  p.val = t.test(v1, v2, paired = paired)
  print(p.val)
  
  log2FC = log( mean(v1 + 0.000000000001)/mean(v2 + 0.000000000001) ) / log(2)
  print(log2FC)
}

⑾ How to test the statistical equivalence of two sets using Fisher’s effect test (ver. 1)

total.gene <- 32285
ST <- 31
scRNAseq <- 14
cross <- 5

a <- cross
b <- scRNAseq - a
c <- ST - a
d <- total.gene - a - b - c
A <- a + b
B <- c + d
C <- a + c
D <- b + d

group<-c("A","A","B","B")
cancer<-c("1.Yes","2.No","1.Yes","2.No")
count<-c(a,b,c,d)
dat<-data.frame(group,cancer,count)
tab<-xtabs(count~group+cancer,data=dat)
tab

chisq.test(tab)$observed
chisq.test(tab)$expected
fisher.test(tab)
-log(fisher.test(tab)$p.value, 10)

if(cross > ST * scRNAseq / total.gene){
  print("Enrichment")
} else if(cross < ST * scRNAseq / total.gene){
  print("Depletion")
}

⑿ How to test the statistical equivalence of two sets using Fisher’s effect test (ver. 2)

my.Fisher.exact.test <- function(total, A, B, cross){
  a1 <- log10_factorial(A)
  a2 <- log10_factorial(total - A)
  a3 <- log10_factorial(B)
  a4 <- log10_factorial(total - B)

  b1 <- log10_factorial(cross)
  b2 <- log10_factorial(A - cross)
  b3 <- log10_factorial(B - cross)
  b4 <- log10_factorial(total - cross - (A - cross) - (B - cross))
  b5 <- log10_factorial(total)

  out = a1 + a2 + a3 + a4 - b1 - b2 - b3 - b4 - b5
  return(10^out)
}

⒀ MIA assay (enrichment)

my.MIA.assay.enrichment <- function(total, A, B, cross){
  out <- 0
  for(i in cross:min(A, B)){
    out = out + my.Fisher.exact.test(total, A, B, i)
  }
  return(out)
}

⒁ MIA assay (depletion)

my.MIA.assay.depletion <- function(total, A, B, cross){
  out <- 0
  for(i in 0:cross-1){
    out = out + my.Fisher.exact.test(total, A, B, i)
  }
  return(out)
}

7. Bioinformatics

⑴ Explore genes that begin with a specific keyword

gene_starting_with <- function(keyword){
  human = read.csv("https://blog.kakaocdn.net/dn/29YTj/btrS5iG9QOH/Di6RQKxHOPDii7EjkdHN30/human_genes_36601.tsv?attach=1&knm=tfile.tsv", sep = '\t', header = F)
  mouse = read.csv("https://blog.kakaocdn.net/dn/wkjwJ/btrS1QSgrpD/VS8ELANCQyeZAA3vL8JQP0/mouse_genes_32285.tsv?attach=1&knm=tfile.tsv", sep = '\t', header = F)
  ar = array()
  flag = 0

  if(keyword == toupper(keyword)){ # human genes
    for(i in 1:dim(human)[1]){
      if(grepl(keyword, human[i, 2], fixed = TRUE)){
        flag = flag + 1
        ar[flag] = human[i, 2]
      }
    }   
  }   
  else{ # mouse genes
    for(i in 1:dim(mouse)[1]){
      if(grepl(keyword, mouse[i, 2], fixed = TRUE)){
        flag = flag + 1
        ar[flag] = mouse[i, 2]
      }
    }   
  }
  
  return(ar)
}

⑵ Conversion between enembl_gene_id and gene_symbol

① convert from ensembl.gene to gene.symbol

library(EnsDb.Hsapiens.v79)
ensembl.genes <- c("ENSG00000150676", "ENSG00000099308", "ENSG00000142676", "ENSG00000180776", "ENSG00000108848", "ENSG00000277370", "ENSG00000103811", "ENSG00000101473")
geneIDs1 <- ensembldb::select(EnsDb.Hsapiens.v79, keys= ensembl.genes, keytype = "GENEID", columns = c("SYMBOL","GENEID"))

② Convert from enembl.gene to gene.symbol (using the table)

○ human_genes_36601.tsv

○ mouse_genes_32285.tsv

ensembl_to_gene <- function(ensembl_list){
  ar = array(dim = length(ensembl_list))

  human = read.csv("https://blog.kakaocdn.net/dn/29YTj/btrS5iG9QOH/Di6RQKxHOPDii7EjkdHN30/human_genes_36601.tsv?attach=1&knm=tfile.tsv", sep = '\t', header = F)
  mouse = read.csv("https://blog.kakaocdn.net/dn/wkjwJ/btrS1QSgrpD/VS8ELANCQyeZAA3vL8JQP0/mouse_genes_32285.tsv?attach=1&knm=tfile.tsv", sep = '\t', header = F)

  for(i in 1:length(ensembl_list)){
    if(grepl('ENSG', ensembl_list[i], fixed = TRUE)){ # human gene
      index = match(ensembl_list[i], human[, 1])
      ar[i] = human[index, 2]
    } 
    else if(grepl('ENSMUSG', ensembl_list[i], fixed = TRUE)){ # mouse gene
      index = match(ensembl_list[i], mouse[, 1])
      ar[i] = mouse[index, 2]
    } 
  }
  return(ar)
}

③ Convert from gene.symbol to ensembl.gene

library(EnsDb.Hsapiens.v79)
geneSymbols <-  c('DDX26B','CCDC83',  'MAST3', 'RPL11', 'ZDHHC20',  'LUC7L3',  'SNORD49A',  'CTSH', 'ACOT8')
geneIDs2 <- ensembldb::select(EnsDb.Hsapiens.v79, keys= geneSymbols, keytype = "SYMBOL", columns = c("SYMBOL","GENEID"))

④ Convert from gen.symbol to enembl.gen (using the table)

gene_to_ensembl <- function(gene_list){
  ar = array(dim = length(gene_list))

  human = read.csv("https://blog.kakaocdn.net/dn/29YTj/btrS5iG9QOH/Di6RQKxHOPDii7EjkdHN30/human_genes_36601.tsv?attach=1&knm=tfile.tsv", sep = '\t', header = F)
  mouse = read.csv("https://blog.kakaocdn.net/dn/wkjwJ/btrS1QSgrpD/VS8ELANCQyeZAA3vL8JQP0/mouse_genes_32285.tsv?attach=1&knm=tfile.tsv", sep = '\t', header = F)

  for(i in 1:length(gene_list)){
    if(gene_list[i] == toupper(gene_list[i])){ # human gene
      index = match(gene_list[i], human[, 2])
      ar[i] = human[index, 1]
    } 
    else{ # mouse gene
      index = match(gene_list[i], mouse[, 2])
      ar[i] = mouse[index, 1]
    } 
  }

  # return(ignore_na(ar)) 
  ## if possible, ignore_na should be used
  
  return(ar)
}

⑤ human ensembl transcript to gene name (ref.)

library(biomaRt)
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")  
ensembl_transcript_to_gene <- function(transcript_ids){
  # reference : https://support.bioconductor.org/p/106253/#106256

  res <- getBM(attributes = c('ensembl_transcript_id_version', 
                              'ensembl_gene_id', 
                              'external_transcript_name',
                              'external_gene_name'),
               filters = 'ensembl_transcript_id_version', 
               values = transcript_ids,
               mart = mart)

  if(dim(res)[1] == 0){
    return("")
  }	

  return(res[, 'external_gene_name'])
}

⑥ mouse gene to MGI symbol (using the table)

mouse_gene_to_MGI <- function(mouse_gene_list){
  ar = array(dim = length(mouse_gene_list))

	dat <- read.csv("https://blog.kakaocdn.net/dn/cVeqsA/btrS1JMnxyX/HtVhPmqtxdgt7LQlGkeql0/HOM_MouseHumanSequence.csv?attach=1&knm=tfile.csv")

  for(i in 1:length(mouse_gene_list)){
	  index = match(mouse_gene_list[i], dat[,'Symbol'])
    ar[i] = dat[index, 'Mouse.MGI.ID']
  }

  return(ar)
}

⑦ MGI symbol to mouse gene (using the table)

MGI_to_mouse_gene <- function(MGI_list){
  ar = array(dim = length(MGI_list))

	dat <- read.csv("https://blog.kakaocdn.net/dn/cVeqsA/btrS1JMnxyX/HtVhPmqtxdgt7LQlGkeql0/HOM_MouseHumanSequence.csv?attach=1&knm=tfile.csv")

  for(i in 1:length(MGI_list)){
	  index = match(MGI_list[i], dat[,'Mouse.MGI.ID'])
    ar[i] = dat[index, 'Symbol']
  }

  return(ar)
}

⑧ human gene to HGNC symbol (using the table)

human_gene_to_HGNC <- function(human_gene_list){
  ar = array(dim = length(human_gene_list))

	dat <- read.csv("https://blog.kakaocdn.net/dn/cVeqsA/btrS1JMnxyX/HtVhPmqtxdgt7LQlGkeql0/HOM_MouseHumanSequence.csv?attach=1&knm=tfile.csv")

  for(i in 1:length(human_gene_list)){
	  index = match(human_gene_list[i], dat[,'Symbol'])
    ar[i] = dat[index, 'HGNC.ID']
  }

  return(ar)
}

⑨ HGNC symbol to human gene (using the table)

HGNC_to_human_gene <- function(HGNC_list){
  ar = array(dim = length(HGNC_list))

	dat <- read.csv("https://blog.kakaocdn.net/dn/cVeqsA/btrS1JMnxyX/HtVhPmqtxdgt7LQlGkeql0/HOM_MouseHumanSequence.csv?attach=1&knm=tfile.csv")

  for(i in 1:length(HGNC_list)){
	  index = match(HGNC_list[i], dat[,'HGNC.ID'])
    ar[i] = dat[index, 'Symbol']
  }

  return(ar)
}

⑩ Transformation table for animals other than humans and mice

⑶ Affymetrix probe ID, enembl_gen_id, gain_name

① convert Affymetrix probe ID to ensembl_gene_id, gene_name

#Convert Affymetrix probe ID to ensembl_gene_id, gene_name
## https://www.biostars.org/p/328065/#328328
## https://www.biostars.org/p/332461/#332474
BiocManager::install("biomaRt", force=TRUE)
library(biomaRt)

dat<-c('1007_s_at', '1053_at', '117_at', '121_at', '1255_g_at', '1294_at')
require("biomaRt")
mart <- useMart("ENSEMBL_MART_ENSEMBL")
mart <- useDataset("hsapiens_gene_ensembl", mart)
annotLookup <- getBM(
  mart = mart,
  attributes = c(
    "affy_hg_u133_plus_2",
    "ensembl_gene_id",
    "gene_biotype",
    "external_gene_name"),
  filter = "affy_hg_u133_plus_2",
  values = dat,
  uniqueRows=TRUE)

⑷ Gene homology between human and mouse

① Overview

○ Human genes are 36601

○ There are 32285 genes in the mouse.

○ Human and mouse genes are almost the same, but there are many differences.

○ Therefore, it is necessary to know how to convert human genes into mouse genes or vice versa.

○ In many cases, the mouse gene is not a human gene that simply capitalizes the mouse gene.

② Method 1. BiomaRt

install.packages("BiocManager")
BiocManager::install("biomaRt")
library(biomaRt)

genes <- c("Xkr4", "Gm1992", "Gm37381")

human <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")
hh <- getLDS(attributes = c("mgi_symbol", "ensembl_gene_id", "entrezgene_id"), filters = "mgi_symbol", values = genes, mart = mouse, attributesL =
c("hgnc_symbol", "ensembl_gene_id", "entrezgene_id"), martL = human, uniqueRows = T)

○ attribute 1. Determine the correspondence between MGI symbol and HGNC symbol.

○ attribute 2. Determine the corresponding relationship between ENSMUSG code and ENSG code.

○ attribute 3. Determine the correspondence between NCBI gain ID and NCBI gain ID.

○ For each attribute, information about the mouse on the left and information about the person on the right.

○ currently inoperative: Currently, the following error message has been found and is not working.

#1. Error in getLDS(attributes = c("mgi_symbol", "ensembl_gene_id", "entrezgene_id"), : Query ERROR: caught BioMart::Exception::Database: Could not connect to mysql database ensembl_mart_106: DBI connect ('database =ensembl_mart_106;host=127.0.0.1;port=5316','ensro',...) failed: Can't connect to MySQL server on '127.0.0.1' (111) at /nfs/public/ro/ensweb/live/mart/www_106/biomart-perl/lib/BioMart/Configuration/DBLocation.pm line 98.
#2. Error in textConnection(attrfilt) : invalid 'text' argument
#3. Ensembl site unresponsive, trying useast mirror. Ensembl site unresponsive, trying asia mirror
#4. Error: biomaRt has encountered an unexpected server error. Consider trying one of the Ensembl mirrors (for more details look at ?useEnsembl)

③ Method 2. Use of the human-mouse conversion table provided on the MGI website

○ HOM_MouseHumanSequence.csv

human_to_mouse <- function(human_gene){
  hom <- read.csv("https://blog.kakaocdn.net/dn/cVeqsA/btrS1JMnxyX/HtVhPmqtxdgt7LQlGkeql0/HOM_MouseHumanSequence.csv?attach=1&knm=tfile.csv")

  mouse_gene = array()
  flag = 0

  for(i in 1 : length(human_gene)){
    index = match(human_gene[i], hom[hom$Common.Organism.Name == 'human', 'Symbol'])
    key = hom[hom$Common.Organism.Name == 'human', 'DB.Class.Key'][index]
    flag = flag + 1
    mouse_gene[flag] = hom[hom$DB.Class.Key == key 
                           & hom$Common.Organism.Name == 'mouse, laboratory'
                           , 'Symbol'][1] # duplicate mouse genes can be found
  }

  return(mouse_gene)
}

mouse_to_human <- function(mouse_gene){
  hom <- read.csv("https://blog.kakaocdn.net/dn/cVeqsA/btrS1JMnxyX/HtVhPmqtxdgt7LQlGkeql0/HOM_MouseHumanSequence.csv?attach=1&knm=tfile.csv")

  human_gene = array()
  flag = 0

  for(i in 1 : length(mouse_gene)){
    index = match(mouse_gene[i], hom[hom$Common.Organism.Name == 'mouse, laboratory', 'Symbol'])
    key = hom[hom$Common.Organism.Name == 'mouse, laboratory', 'DB.Class.Key'][index]
    flag = flag + 1
    human_gene[flag] = hom[hom$DB.Class.Key ==  key
                           & hom$Common.Organism.Name == 'human'
                           , 'Symbol'][1] # duplicate human genes can be found
  }

  return(human_gene)
}

⑸ chromosome position to hgnc_symbol

ChromosomePosition_to_hgnc_symbol <- function(chromosome, start, end){
  # reference : https://support.bioconductor.org/p/127035/

  library(biomaRt)
  positions <- data.frame(chromosome = chromosome,
                          start = start,
                          end = end)

  ensembl = useEnsembl(biomart='ensembl', 
                       dataset="hsapiens_gene_ensembl") 

  results <- getBM(attributes = c("hgnc_symbol", "chromosome_name", "start_position", "end_position"), 
                   filters = c("chromosome_name", "start", "end"),
                   values = list(positions[,1], positions[,2], positions[,3]),
                   mart = ensembl)

  print(results)

  postions_combined <- apply(as.matrix(positions), 1, paste, collapse = ":")

  results2 <- getBM(attributes = c("hgnc_symbol", "chromosome_name", "start_position", "end_position"), 
                   filters = c("chromosomal_region"),
                   values = postions_combined,
                   mart = ensembl)

  print(results2)
}

⑹ gene name to chromosome position: Use of the conversion table(human, mouse)

gene_to_chromosome_position <- function(gene_list){
  # gene_list : list of genes

  human <- read.csv("https://blog.kakaocdn.net/dn/lTbKq/btrWjlmGho8/eWwWHbfLOlVGKAVeuDMKt1/human%20gene%20annotation.csv?attach=1&knm=tfile.csv")
  mouse = read.csv("https://blog.kakaocdn.net/dn/clSwT7/btrWcrWmS41/mNLCUuBlQxfJFhG1U2JQNk/mouse%20gene%20annotation.csv?attach=1&knm=tfile.csv")

  result = array()
  
  for(i in 1:length(gene_list)){
    if(gene_list[i] == toupper(gene_list[i])){ # human gene
      idx = match(gene_list[i], human[, 1])
      note = paste('Gene.ID: ', gene_list[i],
                   ', chromosome: ', human[idx, 'chromosome'],
                   ', start: ', human[idx, 'start'],
                   ', end: ', human[idx, 'end'])
      result[i] = note
    }
    else{ # mouse gene
      idx = match(gene_list[i], mouse[, 1])
      note = paste('Gene.ID: ', gene_list[i],
                   ', chromosome: ', mouse[idx, 'chromosome'],
                   ', start: ', mouse[idx, 'start'],
                   ', end: ', mouse[idx, 'end'])
      result[i] = note
    }
  }
  
  return(result)
}

⑺ gene name to description

gene_to_description <- function(gene_list){
  # gene_list : list of genes

  human <- read.csv("https://blog.kakaocdn.net/dn/lTbKq/btrWjlmGho8/eWwWHbfLOlVGKAVeuDMKt1/human%20gene%20annotation.csv?attach=1&knm=tfile.csv")
  mouse = read.csv("https://blog.kakaocdn.net/dn/clSwT7/btrWcrWmS41/mNLCUuBlQxfJFhG1U2JQNk/mouse%20gene%20annotation.csv?attach=1&knm=tfile.csv")

  result = array()
  
  for(i in 1:length(gene_list)){
    if(gene_list[i] == toupper(gene_list[i])){ #human
      idx = match(gene_list[i], human[, 1])
      result[i] = human[idx, 'Description']
    }
    else{ #mouse 
      idx = match(gene_list[i], mouse[, 1])
      result[i] = mouse[idx, 'Description']
    }    
  }
  
  return(result)
}

⑻ gene name to bioType

gene_to_bioType <- function(gene_list){
  # gene_list : list of mouse genes

  human <- read.csv("https://blog.kakaocdn.net/dn/lTbKq/btrWjlmGho8/eWwWHbfLOlVGKAVeuDMKt1/human%20gene%20annotation.csv?attach=1&knm=tfile.csv")
  mouse = read.csv("https://blog.kakaocdn.net/dn/clSwT7/btrWcrWmS41/mNLCUuBlQxfJFhG1U2JQNk/mouse%20gene%20annotation.csv?attach=1&knm=tfile.csv")

  result = array()
  
  for(i in 1:length(gene_list)){
    if(gene_list[i] == toupper(gene_list[i])){ # human
      idx = match(gene_list[i], human[, 1])
      result[i] = human[idx, 'bioType']
    }
    else { # mouse
      idx = match(gene_list[i], mouse[, 1])
      result[i] = mouse[idx, 'bioType']
    }
  }
  
  return(result)
}

⑼ Change the name of the Seurat object object gene: Need to create a new object (reference)

# RenameGenesSeurat  ------------------------------------------------------------------------------------
RenameGenesSeurat <- function(obj = ls.Seurat[[i]], newnames = HGNC.updated[[i]]$Suggested.Symbol) { # Replace gene names in different slots of a Seurat object. Run this before integration. Run this before integration. It only changes obj@assays$RNA@counts, @data and @scale.data.
  print("Run this before integration. It only changes obj@assays$RNA@counts, @data and @scale.data.")
  RNA <- obj@assays$RNA

  if (nrow(RNA) == length(newnames)) {
    if (length(RNA@counts)) RNA@counts@Dimnames[[1]]            <- newnames
    if (length(RNA@data)) RNA@data@Dimnames[[1]]                <- newnames
    if (length(RNA@scale.data)) RNA@scale.data@Dimnames[[1]]    <- newnames
  } else {"Unequal gene sets: nrow(RNA) != nrow(newnames)"}
  obj@assays$RNA <- RNA
  return(obj)
}
# RenameGenesSeurat(obj = SeuratObj, newnames = HGNC.updated.genes)

⑽ Change cluster information

update_cluster_in_seurat_obj <- function(seurat_obj, barcode, cluster){

  # dim(seurat_obj)[1] = length(barcode) = length(cluster)
  mat <- matrix(0, nrow = length(barcode), ncol = 2)
  mat[, 1] = barcode
  mat[, 2] = cluster
  mat = as.data.frame(mat)
  rownames(mat) = barcode

  seurat_obj@meta.data$orig.ident = mat[rownames(seurat_obj@meta.data), 2]
  # you may need to modify the above code
  seurat_obj@active.ident = as.factor(seurat_obj@meta.data$orig.ident)
  
  return (seurat_obj)
}

⑾ Idents allows you to write FindAllMarkers for the metadata you want (reference 1, reference 2)

Idents(pbmc) <- pbmc$celltype
markers = FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)

⑿ Reorder the place within the plot of the orig.ident of the Seurat object

# Reference : https://github.com/satijalab/seurat/issues/2471

object$orig.ident <- factor(x = object$orig.ident, levels = c("A", "B", "C"))

⒀ Save Seurat object pbmc3k as h5ad (reference)

library(Seurat)
library(SeuratData)
library(SeuratDisk)

SaveH5Seurat(pbmc3k, filename = "~/Downloads/pbmc3k.h5Seurat")
Convert("~/Downloads/pbmc3k.h5Seurat", dest = "h5ad")

① The generated .h5ad file can be open with the following code in Python

import scanpy as sc

pbmc = sc.read_h5ad("pbmc3k.h5ad")

⒁ Save Seurat object as h5ad (ver.2)

① The above method no longer works well since the Seurat version updated to 5.0.

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("zellkonverter")
BiocManager::install("SingleCellExperiment")
install.packages("Seurat")  # 필요시

library(Seurat)
library(zellkonverter)
library(SingleCellExperiment)

# 1) RDS 파일 불러오기
seurat_obj <- readRDS("your_seurat_object.rds")

# 2) Seurat -> SCE 변환
sce_obj <- as.SingleCellExperiment(seurat_obj)

# 3) SCE -> h5ad 변환
writeH5AD(sce_obj, file = "output.h5ad")

⒂ When there are matrix.mtx, barcodes.tsv, features.tsv files and a spatial folder within the tissue_dir directory, provide the code for reading Visium data in R.

library(Seurat)

b_data <-ReadMtx('./matrix.mtx.gz',
               './barcodes.tsv.gz',
               './features.tsv.gz',
               feature.column=1)
b_data = CreateSeuratObject(b_data, assay='Spatial')
b_image = Read10X_Image(paste('./spatial/',sep=''))
b_image_ = b_image[Cells(b_data)]
DefaultAssay(object = b_image_) <- 'Spatial'
b_data[["slice1"]]=b_image_
# you can change "slice1"

b_data <- SCTransform(b_data, assay = "Spatial", verbose = FALSE, variable.features.n = 1000)
# Some errors might occur when running SpatialFeaturePlot without SCTransform

SpatialFeaturePlot(b_data, rownames(b_data)[1])
# check whether it works properly

⒃ Provide the code for reading Visium data saved in .h5ad format in R.

library(Seurat)
library(anndata)
library(png)
library(jsonlite)

adata = read_h5ad("adata.h5ad")

hires = adata$uns$spatial$SAMPLE$images$hires
# you must change "SAMPLE"
png_file_path_hires = "~/Downloads/data/spatial/tissue_hires_image.png"
png(filename = png_file_path_hires, width = dim(hires)[1], height = dim(hires)[2])
par(mar = c(0, 0, 0, 0))  # Remove margins
plot.new()
rasterImage(hires, 0, 0, 1, 1)
dev.off()

lowres = adata$uns$spatial$SAMPLE$images$lowres
# you must change "SAMPLE"
png_file_path_lowres = "~/Downloads/data/spatial/tissue_lowres_image.png"
png(filename = png_file_path_lowres, width = dim(lowres)[1], height = dim(lowres)[2])
par(mar = c(0, 0, 0, 0))  # Remove margins
plot.new()
rasterImage(lowres, 0, 0, 1, 1)
dev.off()

scale_factors <- adata$uns$spatial$SAMPLE$scalefactors
# you must change "SAMPLE"
json_file_path <- "~/Downloads/data/spatial/scalefactors_json.json"
write_json(scale_factors, json_file_path)

df = adata$obs[,1:3]
df[,4] = adata$obsm$spatial[,2]
df[,5] = adata$obsm$spatial[,1]
write.csv(df, "~/Downloads/data/spatial/tissue_positions_list.csv")

b_data = CreateSeuratObject(t(as.matrix(adata$X)), assay='Spatial')
b_image = Read10X_Image(paste('~/Downloads/data/spatial/',sep=''))
b_image_ = b_image[Cells(b_data)]
DefaultAssay(object = b_image_) <- 'Spatial'
b_data[["slice1"]]=b_image_
# you can change "slice1"

b_data <- SCTransform(b_data, assay = "Spatial", verbose = FALSE, variable.features.n = 1000)
# Some errors might occur when running SpatialFeaturePlot without SCTransform

SpatialFeaturePlot(b_data, rownames(b_data)[1])
# An error occurs...

⒄ Code for acquiring highly variable genes

library(dplyr)
library(Seurat)

pbmc.data <- Read10X(data.dir = "./outs")
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc[["percent.mt"]] <- PercentageFeatureSet(object = pbmc, pattern = "^MT-")
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 15)
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- FindVariableFeatures(object = pbmc, selection.method = "vst", nfeatures = 2000)

gene_list = rownames(pbmc)[pbmc@assays$RNA@meta.features$vst.variable]

⒅ PCA

# reference : https://kkokkilkon.tistory.com/144

#install_github("devtools")
library(devtools)
#install_github("ggbiplot", "vqv")
library(ggbiplot)

PCA <- function(dt, dt_group, scale=T){
  # rownames(dt) : our interest
  # colnames(dt) : the dimensional space of each sample
  
  pca_dt <- prcomp(dt,
                   center = T,
                   scale. = scale)
  
  ggbiplot(pca_dt,
                choices = c(1, 2),
                obs.scale = 1,
                var.scale = 1,
                groups = dt_group,
                circle = TRUE,
					  		varname.size=0,
					  		var.axes = F)
  
}

# Example
dt <- iris[, -5]
dt_group <- iris[, 5]
scale = TRUE
PCA(dt, dt_group, scale)

⒆ Kaplan-Meier Survival Analysis

# Install and Load Required Packages
install.packages("survival")
install.packages("survminer")
library(survival)
library(survminer)

# Create modified data frames
# Accurately reflect all deaths and sensored cases
surv_data <- data.frame(
  time = c(6, 12, 21, 27, 32, 39, 43, 43, 43, 89, 89, 89, 89, 89, 89, 261, 263, 270, 270, 311),
  status = c(1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1) # 사망은 1, censoring은 0
)

# Create a Surv object
surv_obj <- Surv(time = surv_data$time, event = surv_data$status)

# Estimate a Kaplan-Meier survival curve
fit <- survfit(surv_obj ~ 1, data = surv_data)

# Draw a survival curve
ggsurvplot(
  fit, 
  data = surv_data, 
  xlab = "Time", 
  ylab = "Survival probability", 
  title = "Kaplan-Meier Survival Curve",
  surv.median.line = "hv", # Add a median line for the survival curve
  ggtheme = theme_minimal(), # Set a theme 
  risk.table = TRUE, # Add a risk table
  palette = "Dark2" # Set a color palette 
)

8. Plot Generation

⑴ Example of storing images in png

png(file = "my plot.png", width = 1500, height = 300)
DimPlot(pbmc, reduction = "umap", label = TRUE, repel = TRUE)
dev.off()

⑵ A function that draws the confidence interval between the scatter plot and the slope given x and y

scatter_plot <- function(x, y, xlab = "x", ylab = "y", point_size = 2, lab_size = 4, png=TRUE){
  library(ggplot2)

  # the lenth(x) must be same with the length(y)
  mat <- matrix(0, nrow = length(x), ncol = 2)
  mat[, 1] = x
  mat[, 2] = y
  colnames(mat) = c(xlab, ylab)
  mat <- as.data.frame(mat)

  if(png){
    png("./scatter_plot.png",width=2000,height=2000,res=500)
    ggplot(mat, aes(x=x, y=y)) + geom_point(shape=19, size=point_size, color="blue") + theme(plot.background = element_blank(),   panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(size =1)) +   stat_smooth(method = lm, level=.95, color="grey") + labs(x=xlab, y=ylab, size=lab_size)
    dev.off()
  } else{
    ggplot(mat, aes(x=x, y=y)) + geom_point(shape=19, size=point_size, color="blue") + theme(plot.background = element_blank(),   panel.grid.major = element_blank(), panel.grid.minor = element_blank(),panel.background = element_blank(), axis.line = element_line(size =1)) +   stat_smooth(method = lm, level=.95, color="grey") + labs(x=xlab, y=ylab, size=lab_size)
  }
}

⑶ A function that draws a Spatial FeaturePlot when x, y, and color are given

my.plot <- function(x, y, col){
	# assume that length(x) = length(y) = length(col)

  plot(x, y, t="n")
  colfunc <- colorRampPalette(c("#000000", "#EB4600", "#FFF800"))
  
  coll = array(dim = length(col))
  for(i in 1 : length(col)){
    coll[i] <- colfunc(100) [as.integer( col[i] / max(col) * 99 + 1)] 
  }
  
  text(x, y, labels = "●", col = coll, cex = 1)
}

⑷ spatial feature plot

# tissue_dir : the directory that contains a filtered_feature_bc_matrix.h5
tissue_dir <- './outs/' 

# Tgenes : genes of interest
Tgenes <- c('Slc2a1', 'Slc2a3')

conv_spatial_feature_plot <- function(tissue_dir, Tgenes, quality.control = FALSE){
  library(Seurat)
  library(SeuratData)
  library(ggplot2)
  library(cowplot)
  library(dplyr)

  # reference : https://nbisweden.github.io/workshop-scRNAseq/labs/compiled/seurat/seurat_07_spatial.html
  
  br.sp = Load10X_Spatial(tissue_dir, slice= 'slice1')
  br.sp <- SCTransform(br.sp, assay = "Spatial", verbose = FALSE, variable.features.n = 1000)

  if(quality.control){
    br.sp <- PercentageFeatureSet(br.sp, "^mt-", col.name = "percent_mito")
    br.sp <- PercentageFeatureSet(br.sp, "^Hb.*-", col.name = "percent_hb")
    br.sp <- br.sp[, br.sp$nFeature_Spatial > 500 & br.sp$percent_mito < 25 & br.sp$percent_hb < 20]
  }

  SpatialFeaturePlot(br.sp, features = Tgenes)
}

conv_spatial_feature_plot(tissue_dir, Tgenes)

⑸ enhanced volcano plot given gene.list, log FC value vector, and adjusted p value value vector

my.EnhancedVolcano <- function(gene.name, logFC, adj.P.Val, 
                               pCutoff = 0.05, FCcutoff = 0.3,
                               xlim = c(-0.5, 0.5), ylim = c(-0.5, 0.5)){
  # install.packages("BiocManager")
  # BiocManager::install("EnhancedVolcano")
  library(EnhancedVolcano)


  tested <- matrix(0, nrow = length(gene.name), ncol = 2)
  tested <- as.data.frame(tested)
  for(i in 1:length(gene.name)){
    tested[i, 1] <- logFC[i]
    tested[i, 2] <- adj.P.Val[i]
  }
  rownames(tested) <- gene.name
  colnames(tested) <- c('logFC', 'adj.P.Val')

  EnhancedVolcano(tested, lab = rownames(tested), 
                  x='logFC', y='adj.P.Val', xlim = xlim, ylim = ylim, 
                  pCutoff = pCutoff, FCcutoff = FCcutoff) 
}

⑹ How to find the gene ontology (GO) from the gene list

GO <- function(gene){
  library(EnhancedVolcano)
  library(clusterProfiler)
  library(org.Hs.eg.db)
  library(org.Mm.eg.db)
  library(enrichplot)

  # ont = "ALL", "BP", "CC", "MF"
  # showCategory is not mandatory

  gene <- gsub('.*-', '', gene) # -와 그 앞에 있는 것들을 제거 (예 : "GRCh38-")
  gene <- gsub('\\ .*$', '', gene) # 'KLK2 ENSG00000167751' 같은 것을 해결 
    
  if (gene[1] == toupper(gene[1])){ ## Human gene
      gene.df <- bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
      gene.df <- as.vector(gene.df[[2]])
      GO <- enrichGO(gene.df, OrgDb = 'org.Hs.eg.db',keyType = "ENTREZID", ont = "ALL", pvalueCutoff = 0.05, pAdjustMethod = "BH")
      return(GO)
  } else{ ## Mouse gene?
      gene.df <- bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)
      gene.df <- as.vector(gene.df[[2]])
      GO <- enrichGO(gene.df, OrgDb = 'org.Mm.eg.db',keyType = "ENTREZID", ont = "ALL", pvalueCutoff = 0.05, pAdjustMethod = "BH")
      return(GO)
  }
}

⑺ How to draw a gene onology (GO) plot from the gene list (▶ How to Interpret GO Plot)

GO.plot <- function(gene){
  library(EnhancedVolcano)
  library(clusterProfiler)
  library(org.Hs.eg.db)
  library(org.Mm.eg.db)
  library(enrichplot)

  # ont = "ALL", "BP", "CC", "MF"
  # showCategory is not mandatory

  gene <- gsub('.*-', '', gene) # -와 그 앞에 있는 것들을 제거 (예 : "GRCh38-")
  gene <- gsub('\\ .*$', '', gene) # 'KLK2 ENSG00000167751' 같은 것을 해결 
    
  if (gene[1] == toupper(gene[1])){ ## Human gene
      gene.df <- bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
      gene.df <- as.vector(gene.df[[2]])
      GO <- enrichGO(gene.df, OrgDb = 'org.Hs.eg.db',keyType = "ENTREZID", ont = "ALL", pvalueCutoff = 0.05, pAdjustMethod = "BH")
      dotplot(GO,split="ONTOLOGY", showCategory = 5)+facet_grid(ONTOLOGY~., scale="free")
  } else{ ## Mouse gene?
      gene.df <- bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)
      gene.df <- as.vector(gene.df[[2]])
      GO <- enrichGO(gene.df, OrgDb = 'org.Mm.eg.db',keyType = "ENTREZID", ont = "ALL", pvalueCutoff = 0.05, pAdjustMethod = "BH")
      dotplot(GO,split="ONTOLOGY", showCategory = 5)+facet_grid(ONTOLOGY~., scale="free")
  }
}

### Example
GO.plot(c("COL1A1", "COL1A2", "COL3A1", "COL6A3"))

⑻ How to draw a cnetplot from the gene list

my.cnetplot <- function(gene.list){
  GO <- function(gene){
    library(EnhancedVolcano)
    library(clusterProfiler)
    library(org.Hs.eg.db)
    library(org.Mm.eg.db)
    library(enrichplot)
  
    # ont = "ALL", "BP", "CC", "MF"
    # showCategory is not mandatory
  
    gene <- gsub('.*-', '', gene) # -와 그 앞에 있는 것들을 제거 (예 : "GRCh38-")
    gene <- gsub('\\ .*$', '', gene) # 'KLK2 ENSG00000167751' 같은 것을 해결 
      
    if (gene[1] == toupper(gene[1])){ ## Human gene
        gene.df <- bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db)
        gene.df <- as.vector(gene.df[[2]])
        GO <- enrichGO(gene.df, OrgDb = 'org.Hs.eg.db',keyType = "ENTREZID", ont = "ALL", pvalueCutoff = 0.05, pAdjustMethod = "BH")
        return(GO)
    } else{ ## Mouse gene?
        gene.df <- bitr(gene, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Mm.eg.db)
        gene.df <- as.vector(gene.df[[2]])
        GO <- enrichGO(gene.df, OrgDb = 'org.Mm.eg.db',keyType = "ENTREZID", ont = "ALL", pvalueCutoff = 0.05, pAdjustMethod = "BH")
        return(GO)
    }
  }
  
  go = GO(gene.list)
  ego_ <- 0
  if (gene.list[1] == toupper(gene.list[1])){ ## Human gene
    ego_ <- setReadable(go, org.Hs.eg.db, keyType='ENTREZID')
  } else { ## Mouse gene?
    ego_ <- setReadable(go, org.Mm.eg.db, keyType='ENTREZID')
  }
  
  cnetplot(ego_, categorySize="pvalue", foldChange= gene.list )
}

my.cnetplot(c("COL1A1", "COL1A2", "COL3A1", "COL6A3"))

⑼ Violin plot with statistical analysis

VlnPlot(object = br.sp, features = c('Col1a1'),
        group.by = 'orig.ident', pt.size = 0.1) + 
    	facet_grid(.~tnbc.merge@active.ident)+
    	fill_palette(palette='npg')+
    	stat_compare_means(method = "anova", label='p')+
    	theme(axis.text.x = element_text(angle = 90, hjust = 1),
    	strip.text.x = element_text(size = rel(0.7)))

Input: 2022.05.03 00:13

2156

A collection of main functions useful in R

1. Overview

2. Basics

3. File I/O

4. Vectors and Matrices

5. gsub

6. Statistical Analysis

7. Bioinformatics

8. Plot Generation

results matching ""

No results matching ""