Bonus code
Bonus code
The following code was added thanks to questions from course participants of past sessions. They might be useful for you too.
Gene label conversions
It is often useful to convert different types of gene labels even further. Here is an example for gene label conversion and gene information extraction using biomaRt. A lot of information for each gene can be obtained, such as chromosome location, description, biotype, or the symbols of mouse homologs of human genes, etc.
# install and load the package:
BiocManager::install("biomaRt")
library(biomaRt)
# list the available options
listEnsembl()
ensembl <- useEnsembl(biomart = "genes")
# List the available species and reference genomes:
datasets <- listDatasets(ensembl)
head(datasets)
# search for human dataset and genome version:
datasets[grep("sapiens", datasets$dataset),]
# 80 hsapiens_gene_ensembl Human genes (GRCh38.p13) GRCh38.p13
# To obtain information on human genes, first the data specific to human has to be accessed from the online Ensembl repository
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
# you can extract gene biotype, description, wiki gene description,
# chromosome location of genes, even strand information etc
# using the listAttributes function allows you to view the type of information per gene you can extract.
attributes <- listAttributes(ensembl)
attributes[1:15,]
# Now we extract symbol (external_gene_name), description, gene_biotype (eg whether it is a protein coding or other type of gene):
biomart_gene_info <- getBM(attributes=c("ensembl_gene_id", "external_gene_name", "description", "gene_biotype", "wikigene_description"),
filter="ensembl_gene_id",
values=NK_vs_Th$ensembl_gene_id,
mart=ensembl)
# check the structure of the resulting data frame:
head(biomart_gene_info)
# search for the oncogene key word in the description:
biomart_gene_info[grep("oncogene", biomart_gene_info$description),]
biomart_gene_info[grep("tumor protein", biomart_gene_info$description),]
# search for genes that have TP53 in their gene symbol:
biomart_gene_info[grep("TP53", biomart_gene_info$external_gene_name),]
For conversion of human gene symbols to mouse homologs for example (or vice versa if you provide mouse genes as the “values” arguments and “hsapiens_homolog_associated_gene_name” in the list of the attributes argument), you can also use biomaRt.
# Convert human genes to mouse homologs
ensembl_human_to_mouse <- getBM(attributes=c("ensembl_gene_id","external_gene_name", "mmusculus_homolog_associated_gene_name"),
filter="ensembl_gene_id",
values=NK_vs_Th$ensembl_gene_id,
mart=ensembl)
head(ensembl_human_to_mouse)
# ensembl_gene_id external_gene_name mmusculus_homolog_associated_gene_name
# 1 ENSG00000000003 TSPAN6 Tspan6
# 2 ENSG00000000419 DPM1 Dpm1
# 3 ENSG00000000419 DPM1 Gm20716
# 4 ENSG00000000457 SCYL3 Scyl3
# 5 ENSG00000000460 C1orf112 BC055324
# 6 ENSG00000000938 FGR Fgr
msigdbr package
The msigdbr package hosted on CRAN allows to access gene set collections hosted on MSigDB directly within R. Check out its vignette to view how to download collections for other species such as mouse, zebra fish, horse, etc. The function msigdbr_species()
allows you to list available species.
For example, to download the Hallmark collection with human gene symbols within R:
gmt <- msigdbr::msigdbr(species = "human", category = "H")
# Create the 2 column-format (TERM2GENE argument) required by clusterProfiler:
h_gmt <- gmt[,c("gs_name", "gene_symbol")]
To determine what other species are available with msigdbr:
msigdbr_species()
#> # A tibble: 20 × 2
#> species_name species_common_name
#> <chr> <chr>
#> 1 Anolis carolinensis Carolina anole, green anole
#> 2 Bos taurus bovine, cattle, cow, dairy cow, domestic cat…
#> 3 Caenorhabditis elegans NA
#> 4 Canis lupus familiaris dog, dogs
#> 5 Danio rerio leopard danio, zebra danio, zebra fish, zebr…
#> 6 Drosophila melanogaster fruit fly
#> 7 Equus caballus domestic horse, equine, horse
#> 8 Felis catus cat, cats, domestic cat
#> 9 Gallus gallus bantam, chicken, chickens, Gallus domesticus
#> 10 Homo sapiens human
#> 11 Macaca mulatta rhesus macaque, rhesus macaques, Rhesus monk…
#> 12 Monodelphis domestica gray short-tailed opossum
#> 13 Mus musculus house mouse, mouse
#> 14 Ornithorhynchus anatinus duck-billed platypus, duckbill platypus, pla…
#> 15 Pan troglodytes chimpanzee
#> 16 Rattus norvegicus brown rat, Norway rat, rat, rats
#> 17 Saccharomyces cerevisiae baker's yeast, brewer's yeast, S. cerevisiae
#> 18 Schizosaccharomyces pombe 972h- NA
#> 19 Sus scrofa pig, pigs, swine, wild boar
#> 20 Xenopus tropicalis tropical clawed frog, western clawed frog
For example, we can download the Gene Ontology Biological Process (GO:BP) collection with yeast gene symbols. To first check the available collections with msigdbr, use ``msigdbr_collections()```.
print(msigdbr_collections(), n=20)
# A tibble: 23 × 3
# gs_cat gs_subcat num_genesets
# <chr> <chr> <int>
# 1 C1 "" 299
# 2 C2 "CGP" 3384
# 3 C2 "CP" 29
# 4 C2 "CP:BIOCARTA" 292
# 5 C2 "CP:KEGG" 186
# 6 C2 "CP:PID" 196
# 7 C2 "CP:REACTOME" 1615
# 8 C2 "CP:WIKIPATHWAYS" 664
# 9 C3 "MIR:MIRDB" 2377
# 10 C3 "MIR:MIR_Legacy" 221
# 11 C3 "TFT:GTRD" 518
# 12 C3 "TFT:TFT_Legacy" 610
# 13 C4 "CGN" 427
# 14 C4 "CM" 431
# 15 C5 "GO:BP" 7658
# 16 C5 "GO:CC" 1006
# 17 C5 "GO:MF" 1738
# 18 C5 "HPO" 5071
# 19 C6 "" 189
# 20 C7 "IMMUNESIGDB" 4872
# Obtain the C5 category and GO:BP subcategory for yeast:
gmt <- msigdbr::msigdbr(species = "Saccharomyces cerevisiae", category = "C5", subcategory = "GO:BP")
# We obtained the GO:BP gene sets, the yeast gene symbols, and yeast ensembl_id:
head(gmt[,c("gs_name", "gene_symbol", "ensembl_gene")], n=10)
# A tibble: 10 × 3
# gs_name gene_symbol ensembl_gene
# <chr> <chr> <chr>
# 1 GOBP_10_FORMYLTETRAHYDROFOLATE_METABOLIC_PROCESS ADE3 YGR204W
# 2 GOBP_10_FORMYLTETRAHYDROFOLATE_METABOLIC_PROCESS ADE3 YGR204W
# 3 GOBP_10_FORMYLTETRAHYDROFOLATE_METABOLIC_PROCESS MIS1 YBR084W
# 4 GOBP_2FE_2S_CLUSTER_ASSEMBLY BOL2 YGL220W
# 5 GOBP_2FE_2S_CLUSTER_ASSEMBLY GRX4 YER174C
# 6 GOBP_2FE_2S_CLUSTER_ASSEMBLY GRX5 YPL059W
# 7 GOBP_2FE_2S_CLUSTER_ASSEMBLY JAC1 YGL018C
# 8 GOBP_2FE_2S_CLUSTER_ASSEMBLY NFS1 YCL017C
# 9 GOBP_2_OXOGLUTARATE_METABOLIC_PROCESS ARO8 YGL202W
# 10 GOBP_2_OXOGLUTARATE_METABOLIC_PROCESS ADH4 YGL256W
# If your DE genes are also labeled with gene symbol, create the 2 column-format (TERM2GENE argument) required by clusterProfiler for the GSEA() function for example:
y_gmt <- gmt[,c("gs_name", "gene_symbol")]
We can also use the msigdbr package to download the KEGG collection, in case the gseKEGG()
function of the clusterProfiler package is blocked by firewall issues.
### ---- KEGG with msigdbr download:
msigdbr_collections() #
# C2 "CP:KEGG"
kegg_pw<-msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG")
head(kegg_pw[,c("gs_name", "gene_symbol")])
# # A tibble: 6 × 2
# gs_name gene_symbol
# <chr> <chr>
# 1 KEGG_ABC_TRANSPORTERS ABCA1
# 2 KEGG_ABC_TRANSPORTERS ABCA10
# 3 KEGG_ABC_TRANSPORTERS ABCA12
# 4 KEGG_ABC_TRANSPORTERS ABCA13
# 5 KEGG_ABC_TRANSPORTERS ABCA2
# 6 KEGG_ABC_TRANSPORTERS ABCA3
# Run with GSEA function:
kegg_gsea<-GSEA(gl, eps = 0, TERM2GENE = kegg_pw[,c("gs_name", "gene_symbol")], seed = TRUE)
head(kegg_gsea@result[,c(2:6)])
# Description setSize enrichmentScore NES pvalue
# KEGG_RIBOSOME KEGG_RIBOSOME 86 -0.9016245 -3.534228 9.828038e-49
# KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY 104 0.6183274 2.509621 7.254908e-13
# KEGG_REGULATION_OF_ACTIN_CYTOSKELETON KEGG_REGULATION_OF_ACTIN_CYTOSKELETON 170 0.4547466 1.980859 2.337476e-07
# KEGG_CHEMOKINE_SIGNALING_PATHWAY KEGG_CHEMOKINE_SIGNALING_PATHWAY 161 0.4463380 1.928426 4.354445e-07
# KEGG_FC_GAMMA_R_MEDIATED_PHAGOCYTOSIS KEGG_FC_GAMMA_R_MEDIATED_PHAGOCYTOSIS 87 0.5343886 2.100787 1.599037e-06
# KEGG_B_CELL_RECEPTOR_SIGNALING_PATHWAY KEGG_B_CELL_RECEPTOR_SIGNALING_PATHWAY 70 0.5299578 1.993277 8.000155e-06
kegg_gsea@result[grep("NATURAL_KILLER", kegg_gsea@result$Description), c(2:6)]
# Description setSize enrichmentScore NES pvalue
# KEGG_NATURAL_KILLER_CELL_MEDIATED_CYTOTOXICITY 104 0.6183274 2.509621 7.254908e-13
Code for barplots with ggplot2
Barplot of the 10 most significant gene sets:
library(tidyverse)
GO_NK_Th@result %>%
dplyr::arrange(p.adjust) %>%
slice_head(n = 10) %>%
ggplot(aes(x = -log10(p.adjust), y = reorder(Description, -p.adjust))) +
geom_bar(stat = "identity") +
geom_vline(xintercept = -log10(0.05), linetype = "dashed") +
labs(y = "Description") +
theme_classic()
Barplot of NES colored according to direction:
sorted_GO_NK_Th %>%
dplyr::group_by(color) %>%
dplyr::arrange(desc(abs(NES))) %>%
slice_head(n = 10) %>%
ggplot(aes(x = NES, y = reorder(Description, NES), fill = color)) +
geom_bar(stat = "identity") +
geom_vline(xintercept = 0) +
labs(y = "Description") +
theme_classic() +
theme(legend.position = "none")
Code for a lollipop plot with ggplot2
In this lollipop of GSEA results of leukocyte-related GO pathways, the color will represent up- (red) or down-regulated (blue) gene sets, the dot size represents the setSize. The length of the gene set description is truncated to 50 characters. We use the GSEA results that we obtained after selecting the leukocyte-related GO pathways at the end of exercise 3.
# Bonus: lollipop plot of p-values of leukocyte-related GO pathways. The
# color will represent up- (red) or down-regulated (blue) gene sets,
# the dot size represents the setSize.
# The length of the gene set description is truncated to 50 characters:
# We use the GSEA results that we obtained after selecting the leukocyte-related GO pathways at the end of exercise 3
df <- GO_NK_Th_selection@result
head(df[,1:8])
# ID Description setSize enrichmentScore NES pvalue p.adjust qvalue
# GO:0097529 GO:0097529 myeloid leukocyte migration 182 0.3358543 1.459329 0.0060333962 0.04957978 0.03849908
# GO:0030595 GO:0030595 leukocyte chemotaxis 186 0.3314487 1.446607 0.0057064461 0.04766950 0.03701573
# GO:0007159 GO:0007159 leukocyte cell-cell adhesion 335 0.2955485 1.381506 0.0037782543 0.03815146 0.02962490
# GO:0001776 GO:0001776 leukocyte homeostasis 99 -0.3891370 -1.552210 0.0037570540 0.03812616 0.02960525
# GO:0002685 GO:0002685 regulation of leukocyte migration 185 0.3418586 1.489716 0.0020160142 0.02593948 0.02014220
# GO:0070661 GO:0070661 leukocyte proliferation 273 0.3247110 1.493837 0.0009132421 0.01545176 0.01199841
df$mycolor <- ifelse(df$NES<0, "cornflowerblue","indianred2")
df <- df[order(df$pvalue, decreasing = T),] #revert the order of the rows
df$Label <-stringr::str_trunc(df$Description, 50, "right") # keep max 50 character in a string (nchar("string") to count characters)
df$Label <- factor(df$Label, levels = c(df$Label))
# Use limits of the x-axis range according to the range of p.values:
max_x <- round(max(-log10(df$p.adjust)), digits = 0) + 0.5
xlimits <- c(0, max_x)
xbreaks <- seq_along(0:max_x)
# Create the plot:
p <- ggplot(df, aes(x = Label, y = -log10(p.adjust), fill = mycolor)) +
geom_segment(aes(x = Label, xend = Label, y = 0, yend = -log10(p.adjust)),
color = df$mycolor, lwd = 1) +
geom_point(pch = 21, bg = df$mycolor, aes(size=df$setSize), color=df$mycolor) +
scale_size(name = "Number\nof genes") +
scale_y_continuous(name=expression("-"*"log"[10]*"(p-value)"),
limits=xlimits,
breaks=xbreaks) +
scale_x_discrete(name="") +
theme_bw(base_size=10, base_family = "Helvetica") +
theme(axis.text=element_text(size=12, colour = "black"),
axis.title=element_text(size=14)) +
ggtitle("Leukocyte-related GO pathways") +
coord_flip()
p
You will obtain the following lollipop plot :
Code for a heatmap of p-values with ggplot2
For heatmaps, ggplot2 can also be used. Here is an example for a heatmap of the p-values of 6 different gene sets (gs), and the p-value for each gene set was calculated in two comparisons, so we compared the enrichment in genes differentially expressed between NK cells and Th cells, and between NK cells and CD8 T cells (note that this is a dummy example just to show you the example of the code, it is not based on real RNA seq data). Before using ggplot2, you need to create a dataframe that contains a column with the gene set identities, a column with the name of the cell type comparisons, and a column with the p-value of each gene set in each comparison. So a specific format is required for ggplot2.
# Create a data frame that contains the p-value for every gene set for every
# cell type comparison. You need to include the values also for the non-significant
# gene sets. If you use function of the clusterProfiler package, you will usually
# obtain results in the @result slot only for significant gene sets. But if you need
# p-values also for the non-significant gene sets, you can change the argument
# pvalueCutoff = 1 in the gseGO function (and other functions of the clusterProfiler
# package)
# For the example, create a dummy data frame with the list of gene sets (gs), the
# 2 cell type comparisons, and the p-value for each gene set in each comparison:
ora_to_plot<-as.data.frame(cbind(ID=c("gs1", "gs2","gs3","gs4", "gs5", "gs6",
"gs1", "gs2","gs3","gs4", "gs5", "gs6"),
comparison=c(rep("NK_vs_Th",6), rep("NK_vs_CD8",6)),
p_val=as.numeric(c(0.8,0.9,0.6, 0.054, 0.00001, 0.0003,
0.0002, 0.004, 0.001,0.01, 0.85,0.9))))
# transform the p-val to -log10(p-val):
ora_to_plot$log10_p_val<--log10(as.numeric(ora_to_plot$p_val))
# set the none-significant p-values to 0 so that they appear grey in the heatmap:
ora_to_plot$log10_p_val<-ifelse(ora_to_plot$log10_p_val<(-log10(0.05)), 0, ora_to_plot$log10_p_val)
range(ora_to_plot$log10_p_val) # [1] 0 5
head(ora_to_plot, n=12)
# ID comparison p_val log10_p_val
# 1 gs1 NK_vs_Th 0.8 0.000000
# 2 gs2 NK_vs_Th 0.9 0.000000
# 3 gs3 NK_vs_Th 0.6 0.000000
# 4 gs4 NK_vs_Th 0.054 0.000000
# 5 gs5 NK_vs_Th 1e-05 5.000000
# 6 gs6 NK_vs_Th 3e-04 3.522879
# 7 gs1 NK_vs_CD8 2e-04 3.698970
# 8 gs2 NK_vs_CD8 0.004 2.397940
# 9 gs3 NK_vs_CD8 0.001 3.000000
# 10 gs4 NK_vs_CD8 0.01 2.000000
# 11 gs5 NK_vs_CD8 0.85 0.000000
# 12 gs6 NK_vs_CD8 0.9 0.000000
# create a palette of colors based on the Plasma palette (grDevices package)
plot(1:20, 1:20, col=hcl.colors(20,"Plasma"), pch=15, cex=3)
hcl.colors(9,"Plasma")# "RdYlGn")
breaks<-seq(from=0, to=5, by=0.5 )
color<-c("grey78", rev(hcl.colors(9,"Plasma")))
# create plot and export as png:
p<-ggplot(ora_to_plot, aes(comparison, ID, fill= log10_p_val)) +
geom_tile(color = "black") +
scale_fill_gradientn(breaks= breaks,
colors = color) +
theme_bw()
ggsave(plot = p, filename = "heatmap_p_value_ORA.png",
device="png",
width = 4,height = 6)
You will obtain the following heatmap:
Small information on a typical RNAseq workflow
Example for DE analysis with DESeq2
# DGE example with DESeq2:
# R version 4.1.2 (2021-11-01)
BiocManager::install("DESeq2")
library(DESeq2) # v 1.34.0
# setwd("path/to/downloadedData")
counts_NK_Th<-read.csv("htseq_counts_NK_Th.csv", row.names = 1, header = T)
counts_NK_Th<-counts_NK_Th[-c(which(rowSums(counts_NK_Th)==0)),]
dim(counts_NK_Th)
# [1] 38573 15
# build a sample metadata table:
coldata<-as.data.frame(cbind(cell_type=c(rep("NK", 6),
rep("Th", 9)),
donor=sapply(strsplit(colnames(counts_NK_Th), "_"), '[',1),
sample_id=colnames(counts_NK_Th)))
coldata$cell_type<-as.factor(coldata$cell_type)
coldata$cell_type<-factor(coldata$cell_type,
levels=levels(coldata$cell_type)[c(2,1)])
head(coldata)
# cell_type donor sample_id
# 1 NK S15 S15_NK_CD56dim
# 2 NK S15 S15_NK_CD56bright
# 3 NK S16 S16_NK_CD56dim
# 4 NK S16 S16_NK_CD56bright
# 5 NK S17 S17_NK_CD56dim
# 6 NK S17 S17_NK_CD56bright
# Create DESeq object:
dds <- DESeqDataSetFromMatrix(countData = counts_NK_Th,
colData = coldata,
design= ~ donor + cell_type) # Difference between cell types, accounting for the sample pairing
dds <- DESeq(dds)
resultsNames(dds) # lists the coefficients
# [1] "Intercept" "donor_S16_vs_S15" "donor_S17_vs_S15" "cell_type_NK_vs_Th"
deseq2_NK_vs_Th <- as.data.frame(results(dds,
alpha=0.05,
contrast=c("cell_type","NK","Th"),
cooksCutoff=F)) # use cooksCutoff=F only if some genes of interest do not have a calculated p-value
# author recomendation is to use default cooksCutoff=T
head(deseq2_NK_vs_Th)
# baseMean log2FoldChange lfcSE stat pvalue padj
# TSPAN6 40.851111 -6.9583034 1.1044621 -6.30017417 2.973114e-10 8.742152e-09
# TNMD 0.104281 -0.1228379 3.1336380 -0.03919977 9.687311e-01 NA
# DPM1 2566.964652 -0.1466129 0.2211112 -0.66307338 5.072836e-01 7.133950e-01
# SCYL3 571.791633 0.5728065 0.4039864 1.41788547 1.562242e-01 3.515869e-01
# C1orf112 201.504414 0.8758449 0.5938469 1.47486651 1.402484e-01 3.263445e-01
# FGR 8793.900467 8.5188295 1.2025099 7.08420757 1.398422e-12 5.868549e-11
deseq2_NK_vs_Th[grep("CPS1", rownames(deseq2_NK_vs_Th)),]
# baseMean log2FoldChange lfcSE stat pvalue padj
# CPS1 2.34916186 -3.56324252 1.855768 -1.92009033 0.05484649 0.1702518
# CPS1.IT1 0.09375824 -0.08381473 3.134376 -0.02674048 0.97866673 NA
deseq2_NK_vs_Th[grep("GZMB", rownames(deseq2_NK_vs_Th)),]
# baseMean log2FoldChange lfcSE stat pvalue padj
# GZMB 27758.47 9.075387 0.8897603 10.19981 1.986563e-24 2.65665e-22