6 Differential gene expression analysis
6.1 Analysis using DESeq2
Here, we are going to perform differential gene expression analysis using DESeq2.
Before this exercise, you are recommended to have basic R programming knowledge and data visualization skill. For that you can refer to my workshop material.
#===============================================================
# Install packages
#===============================================================
# Install bioconductor packages.
<- c(
bioconductor_packages 'DESeq2', 'clusterProfiler',
'biomaRt', 'org.Hs.eg.db',
'org.Mm.eg.db', 'enrichplot'
)
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
::install(bioconductor_packages)
BiocManager
# Install CRAN packages.
<- c(
cran_packages 'tidyverse', 'pheatmap',
'msigdbr', 'RColorBrewer',
'ggrepel'
)install.packages(cran_packages)
#===============================================================
# Load the packages
#===============================================================
library(DESeq2)
library(ggplot2)
library(pheatmap)
library(RColorBrewer)
#===============================================================
# Import data
#===============================================================
# Import gene counts table generated from featureCounts
# - skip first row (general command info)
# - make row names the gene identifiers
<- read.table("final_counts_all.txt",
countdata header = TRUE, skip = 1,
row.names = 1)
head(countdata)
# Remove .bam from column identifiers
colnames(countdata) <- gsub("Aligned.sortedByCoord.out.bam",
"",
colnames(countdata),
fixed = T)
ncol(countdata)
#Take only expression values
<- countdata[ ,c(6:11)]
countdata head(countdata)
#===============================================================
# Convert to matrix
#===============================================================
<- as.matrix(countdata)
countdata head(countdata)
# Assign condition (first three are control,
# second three contain the Knock-Out)
<- factor(c(rep("Control", 3),
condition rep("Knock-Out", 3)))
head(countdata)
#===============================================================
# Prepare DESeqDataSet
#===============================================================
# Create a coldata frame and instantiate the DESeqDataSet.
#See ?DESeqDataSetFromMatrix
<- data.frame(row.names=colnames(countdata),
(coldata
condition))head(coldata)
<- DESeqDataSetFromMatrix(countData=countdata,
dds colData=coldata,
design=~condition)
$condition
dds# Determining the directionality of comparison.
$condition <- relevel(dds$condition, ref = "Control")
dds#===============================================================
# Run the DESeq2 pipeline
#===============================================================
<- DESeq(dds)
dds
dds# Sample level QC by PCA and hierarchical clustering methods
# Transform normalized counts using the rlog transformation
# Transform counts for data visualization
<- rlog(dds, blind=TRUE)
rld #===============================================================
# Principal components analysis (PCA)
#===============================================================
# Plot PCA
# Save the correlation plot
jpeg(filename = "PCA_plot.jpg",
height = 4,width = 6,units = "in",res = 600)
plotPCA(rld, intgroup="condition")+ theme_light()
dev.off()
#===============================================================
#Hierarchical Clustering
#===============================================================
<- assay(rld)
rld_mat # Extract the rlog matrix from the object
<- assay(rld)
rld_mat # Compute pairwise correlation values
<- cor(rld_mat)
rld_cor # Plot heatmap
pheatmap(rld_cor)
<- brewer.pal(6, "Greens")
heat.colors # Save the correlation plot
jpeg(filename = "Correlation_plot.jpg",
height = 4,width = 6,units = "in",res = 600)
pheatmap(rld_cor, color = heat.colors,fontsize = 10,
fontsize_row = 10, height=20)
dev.off()
#===============================================================
# Get differential expression results
#===============================================================
<- results(dds, pAdjustMethod = "fdr", alpha = 0.05)
results head(results)
summary(results)
# Generate MA plot
jpeg("MA_plot.jpg", units="in",
width=7, height=5, res = 600)
plotMA(results)
dev.off()
#===============================================================
# Convert Gene Symbol to multiple IDs
#===============================================================
# Human genome database (Select the correct one)
library(org.Hs.eg.db)
# Add gene full name
$description <- mapIds(x = org.Hs.eg.db,
resultskeys = row.names(results),
column = "GENENAME",
keytype = "SYMBOL",
multiVals = "first")
# Add ENTREZ ID
$entrez <- mapIds(x = org.Hs.eg.db,
resultskeys = row.names(results),
column = "ENTREZID",
keytype = "SYMBOL",
multiVals = "first")
# Add ENSEMBL
$ensembl <- mapIds(x = org.Hs.eg.db,
resultskeys = row.names(results),
column = "ENSEMBL",
keytype = "SYMBOL",
multiVals = "first")
head(results)
# Order by adjusted p-value
<- results[order(results$padj), ]
res
# Merge with normalized count data
<- merge(as.data.frame(counts(dds, normalized=TRUE)),
resdata as.data.frame(res),
by="row.names", sort=FALSE)
head(resdata)
names(resdata)[1] <- "Gene"
head(resdata)
#To remove rows containing NA and write as csv file
library(tidyverse)
<- resdata %>% drop_na()
resdata1 write.csv(resdata1, file="diff_KO_vs_Control.csv",
row.names = F)
# Subset Upregulated and Downregulated genes
<- resdata1 %>%
upreg ::filter(log2FoldChange > 0 & padj < 0.05)
dplyr
<- resdata1 %>%
downreg ::filter(log2FoldChange < 0 & padj < 0.05)
dplyr
write.csv(upreg, file = "KO_upregulated_genes.csv",
row.names = F)
write.csv(downreg, file = "KO_downregulated_genes.csv",
row.names = F)
# Gather Log-fold change and FDR-corrected pvalues from DESeq2 results
# - Change pvalues to -log10 (1.3 = 0.05)
<- data.frame(gene = row.names(res),
data pval = -log10(res$padj),
lfc = res$log2FoldChange)
# Remove any rows that have NA as an entry
<- na.omit(data)
data
# Color the points which are up or down log2(FC=1.5)= 0.58,
# -log10(P-adj=0.05)=1.3
## If fold-change > 0.58 and pvalue > 1.3 (Increased significant)
## If fold-change < 0.58 and pvalue > 1.3 (Decreased significant)
<- mutate(data,
data color = case_when(data$lfc > 0 & data$pval > 1.3 ~ "Increased",
$lfc < 0 & data$pval > 1.3 ~ "Decreased",
data$pval < 1.3 ~ "nonsignificant"))
data
summary(data)
head(data)
# Make a basic ggplot2 object with x-y values
<- ggplot(data, aes(x = lfc, y = pval, color = color))
vol
# Add ggplot2 layers
<- vol+
p geom_point(size = 0.5, alpha = 0.4, na.rm = T) +
scale_color_manual(name = "Directionality",
values = c(Increased = "deepskyblue4",
Decreased = "deepskyblue2",
nonsignificant = "gray80")) +
theme_classic() + # change overall theme
theme(legend.position = "none") + # change the legend
xlab(expression(log[2]("KO" / "Control"))) +
# Change X-Axis label
ylab(expression(-log[10]("adjusted p-value"))) +
# Change Y-Axis label
scale_y_continuous(trans = "log1p")+
# Scale yaxis due to large p-values
geom_hline(yintercept = 1.3,
colour = "red",
linetype="dashed")
# Add p-adj value cutoff horizontal line
#geom_vline(aes(xintercept=0.58),
="gray60",
colour="dashed")+
linetype#geom_vline(aes(xintercept=-0.58),
="gray60",
colour="dashed")+
linetype#xlim(-2.5, 2.5)
#Base vocano plot
p
library(ggrepel)
#If few selected genes need to be annotated in the volcano plot
<- p+ geom_text_repel(data = data %>%
p1 filter(gene %in% c("HES5", "RBPJ",
"MKI67", "AURKB")),
aes(label = gene, x = lfc, y = pval),
box.padding = unit(.7, "lines"),
hjust= 0.30,
segment.color = 'black',
colour = 'black')
#View the volcano plot
p1
#If want to plot top 10 differentially expressed genes
<- p+ geom_text_repel(data=head(data, 10), aes(label=gene),
p2 box.padding = unit(.5, "lines"),
hjust= 0.30,
segment.color = 'black',
max.overlaps = Inf,
colour = 'black')
p2
# Save the volcano plot
ggsave(
"volcano_diff_KO_vs_Control.jpg",
p1,width = 6.00,
height = 5.00,
dpi = 600
)
6.2 Pathway Enrichment Analysis
Pathway enrichment analysis is a statistical method by which we can predict what biological pathways are enriched in a given gene list.
There are two statistical test that can be performed.
- Statistical over-representation test
- Statistical enrichment test
- To know more about these tests you may refer to Nature Protocol
6.2.1 Statistical Over-representation Analysis
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(tidyverse)
library(msigdbr)
# Import the data
<- read.csv(file = "diff_KO_vs_Control.csv",
res header = T, row.names = 1)
<- data.frame(gene = row.names(res),
data pval = -log10(res$padj),
lfc = res$log2FoldChange)
#===============================================================
# GO over-representation analysis (ORA) using enrichGO
#===============================================================
#Filter the genes which are upregulated in KO
<- data %>% dplyr::filter(lfc >0 & pval > 1.3)
geneList
<- enrichGO(gene = geneList$gene,
ego OrgDb = org.Hs.eg.db, # or Org.Hs.eg.db
ont = "ALL",
#one of “BP”, “MF”, “CC” or “ALL”
pAdjustMethod = "fdr",
#one of “bonferroni”, “BH”, “BY”, “fdr”, “none”
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
keyType = "SYMBOL",
#“ENSEMBL”, “ENTREZID”, “SYMBOL”
readable = TRUE)
write.csv(ego@result,
file = "GO_upregulated_clusterprofiler.csv",
row.names = F)
# Plot
jpeg(filename = "Upreg_enrichment.jpg",
width = 8, height = 6, units = "in",
res = 600)
dotplot(ego)+
labs(title = "Functional enrichment of upregulated genes")
dev.off()
# Few other plots
barplot(ego)
upsetplot(ego)
#Filter the genes which are downregulated in KO
<- data %>% dplyr::filter(lfc <0 & pval > 1.3)
geneList_down
<- enrichGO(gene = geneList_down$gene,
ego_down OrgDb = org.Hs.eg.db,
# or Org.Mm.eg.db
ont = "ALL",
#one of “BP”, “MF”, “CC” or “ALL”
pAdjustMethod = "fdr",
#“bonferroni”, “BH”, “BY”, “fdr”, “none”
pvalueCutoff = 0.01,
qvalueCutoff = 0.05,
keyType = "SYMBOL",
#“ENSEMBL”, “ENTREZID”, “SYMBOL”
readable = TRUE)
head(ego_down@result)
write.csv(ego_down@result,
file = "GO_downregulated_clusterprofiler.csv",
row.names = F)
##Plot
jpeg(filename = "Downreg_enrichment.jpg",
width = 8, height = 6, units = "in",
res = 600)
dotplot(ego_down)+
labs(title = "Functional enrichment of downregulated genes")
dev.off()
barplot(ego_down)
upsetplot(ego_down)
#===============================================================
# over-representation analysis (ORA) using MSigDb gene sets
#===============================================================
<- msigdbr(species = "Homo sapiens", category = "C5") %>%
m_ont select(gs_name, gene_symbol)
head(m_ont)
# Upregulated genes functional enrichment analysis
<- enricher(geneList$gene, TERM2GENE=m_ont)
em head(em)
barplot(em)
dotplot(em)
upsetplot(em)
# Downregulated genes functional enrichment analysis
<- enricher(geneList_down$gene, TERM2GENE=m_ont)
ed barplot(ed)
dotplot(ed)
upsetplot(ed)
6.2.2 Statistical Enrichment Analysis
The most used tool for statistical enrichment test is GSEA.
To know more about GSEA you may refer to Nature Protocol
However, here we shall perform GSEA in R which is very easy and fast.
library(clusterProfiler)
library(enrichplot)
library(tidyverse)
library(msigdbr)
<- resdata1 %>%
gene.list ::mutate(Score = -log10(padj)* sign(log2FoldChange))%>%
dplyr::select(symbol, Score)
dplyr
# Make the rank file
<- deframe(gene.list)
ranks head(ranks)
# Set decreasing order
= sort(ranks, decreasing = TRUE)
geneList #===============================================================
# Perform GSEA
#===============================================================
<- GSEA(geneList, TERM2GENE = m_ont)
em2 head(em2)
<- em2@result
gsea_result
# Save the GSEA result
write.csv(gsea_result,
file = "GSEA_Ontology.csv",
row.names = F)
# Save the GSEA Plot
jpeg(filename = "GSEA_plot.jpg",
width = 10, height = 6, units = "in",
res = 600)
gseaplot2(em2, geneSetID = 2, title = em2$Description[2])
dev.off()
6.2.3 Divergent Lollipop Chart
library(tidyverse)
library(msigdbr)
library(clusterProfiler)
library(enrichplot)
#===============================================================
#Import the data
#===============================================================
<- read.csv(file = "diff_KO_vs_Control.csv",
res header = T)
head(res)
<- res %>%
gene.list ::mutate(Score = -log10(padj)* sign(log2FoldChange))%>%
dplyr::select(symbol, Score)
dplyr#Make the rank file
<- deframe(gene.list)
ranks head(ranks)
#Download MSigDb ontology gene sets
#===============================================================
<- msigdbr(species = "Homo sapiens",
m_ont category = "C5") %>%
select(gs_name, gene_symbol)
head(m_ont)
#Download MSigDb Cell Type Signature gene sets
#===============================================================
<- msigdbr(species = "Homo sapiens",
m_cell category = "C8") %>%
select(gs_name, gene_symbol)
head(m_cell)
#decreasing order
= sort(ranks, decreasing = TRUE)
geneList
#Perform GSEA using ONTOLOGY gene sets
#===============================================================
<- GSEA(geneList, TERM2GENE = m_ont)
em2 head(em2@result)
#Perform GSEA using CELL TYPE gene sets
#===============================================================
<- GSEA(geneList, TERM2GENE = m_cell)
em3 head(em3@result)
# Data wrangling
#===============================================================
<- em3@result %>%
celltype_pos mutate(gene_set = "MSigDB Cell Type Signature Gene Set")%>%
top_n(n = 5, wt = NES)
<- em3@result %>%
celltype_neg mutate(gene_set = "MSigDB Cell Type Signature Gene Set")%>%
top_n(n = 5, wt = -NES)
<- em2@result %>%
ont_pos mutate(gene_set = "MSigDB Gene Ontology Gene Set")%>%
top_n(n = 5, wt = NES)
<- em2@result %>%
ont_neg mutate(gene_set = "MSigDB Gene Ontology Gene Set")%>%
top_n(n = 5, wt = -NES)
<- rbind(celltype_pos, celltype_neg)
celltype_merge <- rbind(ont_pos, ont_neg)
GO_merge <- rbind(celltype_merge, GO_merge) |>
my_data mutate(ID = fct_reorder(ID, NES))
# Round up NES values upto 3 digits
$NES <- round(my_data$NES, 3)
my_datahead(my_data)
#Divergent lollipop chart
#===============================================================
<- ggplot(my_data,
p aes(x = ID,
y = NES))+
geom_segment(aes(y = 0,
x = ID,
xend = ID,
yend = NES),
color = "skyblue")+
geom_point(stat = "identity",
aes(size = abs(NES),
fill = p.adjust),
shape = 21)+
coord_flip()+
scale_fill_viridis_b()+
theme_light()+
theme(panel.border = element_blank(),
panel.grid.minor.x = element_blank(),
axis.ticks = element_blank(),
strip.background = element_rect(fill = "skyblue"),
strip.text = element_text(color = "#4a235a"))+
labs(title = "Comparison of KO vs Control",
subtitle = "Gene Set Enrichment Analysis",
size = "NES",
x = "")+
facet_grid(gene_set ~.,space="free", scales="free")
p
#Save the file
#===============================================================
jpeg(filename = "KO_GSEA_chart.jpg",
height = 8, width = 14,
units = "in", res = 600)
pdev.off()