Fall23 Barry Grant Bioinformatics
DAR, pID A69026881
#BiocManager::install("DESeq2")
#BiocManager::install("AnnotationDbi")
library(BiocManager)
library(DESeq2)
Loading required package: S4Vectors
Loading required package: stats4
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unsplit, which.max, which.min
Attaching package: 'S4Vectors'
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
expand.grid, I, unname
Loading required package: IRanges
Loading required package: GenomicRanges
Loading required package: GenomeInfoDb
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats
Attaching package: 'MatrixGenerics'
The following objects are masked from 'package:matrixStats':
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
colWeightedMeans, colWeightedMedians, colWeightedSds,
colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
rowWeightedSds, rowWeightedVars
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Attaching package: 'Biobase'
The following object is masked from 'package:MatrixGenerics':
rowMedians
The following objects are masked from 'package:matrixStats':
anyMissing, rowMedians
library(ggplot2)
library(DESeq2)
citation("DESeq2")
To cite package 'DESeq2' in publications use:
Love, M.I., Huber, W., Anders, S. Moderated estimation of fold change
and dispersion for RNA-seq data with DESeq2 Genome Biology 15(12):550
(2014)
A BibTeX entry for LaTeX users is
@Article{,
title = {Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2},
author = {Michael I. Love and Wolfgang Huber and Simon Anders},
year = {2014},
journal = {Genome Biology},
doi = {10.1186/s13059-014-0550-8},
volume = {15},
issue = {12},
pages = {550},
}
counts <- read.csv("~/Downloads/Airway Scaled Counts.csv", row.names = 1)
metadata <- read.csv("~/Downloads/Airway Metadata.csv")
head(counts)
SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
ENSG00000000003 723 486 904 445 1170
ENSG00000000005 0 0 0 0 0
ENSG00000000419 467 523 616 371 582
ENSG00000000457 347 258 364 237 318
ENSG00000000460 96 81 73 66 118
ENSG00000000938 0 0 1 0 2
SRR1039517 SRR1039520 SRR1039521
ENSG00000000003 1097 806 604
ENSG00000000005 0 0 0
ENSG00000000419 781 417 509
ENSG00000000457 447 330 324
ENSG00000000460 94 102 74
ENSG00000000938 0 0 0
head(metadata)
id dex celltype geo_id
1 SRR1039508 control N61311 GSM1275862
2 SRR1039509 treated N61311 GSM1275863
3 SRR1039512 control N052611 GSM1275866
4 SRR1039513 treated N052611 GSM1275867
5 SRR1039516 control N080611 GSM1275870
6 SRR1039517 treated N080611 GSM1275871
Q1: How many genes are in this dataset? - 38,694 genes
Q2: How many ‘control’ cell lines do we have? - 4 control cell lines
I want to compare the control to the treated columns. To do this I will
-Step 1, Identify and extract the “control” columns -Step 2: calculate the mean value per gene for all these “control” columns and save as ‘control.mean’ -Step 3: do the same for treated -Step 4: compare the ‘control.mean’ and ‘treated.mean’ values
Step 1:
control.inds <- metadata$dex=="control"
metadata[control.inds,]
id dex celltype geo_id
1 SRR1039508 control N61311 GSM1275862
3 SRR1039512 control N052611 GSM1275866
5 SRR1039516 control N080611 GSM1275870
7 SRR1039520 control N061011 GSM1275874
head(counts[,control.inds])
SRR1039508 SRR1039512 SRR1039516 SRR1039520
ENSG00000000003 723 904 1170 806
ENSG00000000005 0 0 0 0
ENSG00000000419 467 616 582 417
ENSG00000000457 347 364 318 330
ENSG00000000460 96 73 118 102
ENSG00000000938 0 1 2 0
control.mean <- rowMeans(counts[,control.inds])
head(control.mean)
ENSG00000000003 ENSG00000000005 ENSG00000000419 ENSG00000000457 ENSG00000000460
900.75 0.00 520.50 339.75 97.25
ENSG00000000938
0.75
Step 2:
treated.inds <- metadata$dex=="treated"
metadata[treated.inds,]
id dex celltype geo_id
2 SRR1039509 treated N61311 GSM1275863
4 SRR1039513 treated N052611 GSM1275867
6 SRR1039517 treated N080611 GSM1275871
8 SRR1039521 treated N061011 GSM1275875
head(counts[,treated.inds])
SRR1039509 SRR1039513 SRR1039517 SRR1039521
ENSG00000000003 486 445 1097 604
ENSG00000000005 0 0 0 0
ENSG00000000419 523 371 781 509
ENSG00000000457 258 237 447 324
ENSG00000000460 81 66 94 74
ENSG00000000938 0 0 0 0
treated.mean <- rowMeans(counts[,treated.inds])
head(treated.mean)
ENSG00000000003 ENSG00000000005 ENSG00000000419 ENSG00000000457 ENSG00000000460
658.00 0.00 546.00 316.50 78.75
ENSG00000000938
0.00
Q3: How would you make the above code in either approach more robust? Is there a function that could help here? - the apply function
meancounts <- data.frame(control.mean, treated.mean)
Q5: create a scatter plot Q5b: make it in ggplot - using geom_point()
plot(meancounts)
#in ggploot
ggplot(meancounts, aes(control.mean, treated.mean)) +
geom_point(alpha=0.2)
Q6: Try plotting both axis on a log scale, What is the argument to plot() that allows you to do this? - log=“xy”
plot(meancounts, log="xy")
Warning in xy.coords(x, y, xlabel, ylabel, log): 15032 x values <= 0 omitted
from logarithmic plot
Warning in xy.coords(x, y, xlabel, ylabel, log): 15281 y values <= 0 omitted
from logarithmic plot
#in ggplot
meancounts <- data.frame(control.mean, treated.mean)
plot(meancounts)
ggplot(meancounts, aes(control.mean, treated.mean)) +
geom_point(alpha=0.2) +
scale_x_continuous(trans="log2") +
scale_y_continuous(trans="log2")
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Transformation introduced infinite values in continuous y-axis
Logs are super useful when we have such skewed data
# Treated / control
log2(10/10)
[1] 0
log2(20/10)
[1] 1
log2(5/10)
[1] -1
#new column with log2(fold change)
meancounts$log2fc <- log2(meancounts[,"treated.mean"]/meancounts[,"control.mean"])
head(meancounts)
control.mean treated.mean log2fc
ENSG00000000003 900.75 658.00 -0.45303916
ENSG00000000005 0.00 0.00 NaN
ENSG00000000419 520.50 546.00 0.06900279
ENSG00000000457 339.75 316.50 -0.10226805
ENSG00000000460 97.25 78.75 -0.30441833
ENSG00000000938 0.75 0.00 -Inf
head(meancounts[,1:2]==0)
control.mean treated.mean
ENSG00000000003 FALSE FALSE
ENSG00000000005 TRUE TRUE
ENSG00000000419 FALSE FALSE
ENSG00000000457 FALSE FALSE
ENSG00000000460 FALSE FALSE
ENSG00000000938 FALSE TRUE
head(rowSums(meancounts[,1:2]==0)>0)
ENSG00000000003 ENSG00000000005 ENSG00000000419 ENSG00000000457 ENSG00000000460
FALSE TRUE FALSE FALSE FALSE
ENSG00000000938
TRUE
to.rm.inds <- rowSums(meancounts[,1:2]==0)>0
#shows you ones that are not 0's
head(meancounts[!to.rm.inds, ])
control.mean treated.mean log2fc
ENSG00000000003 900.75 658.00 -0.45303916
ENSG00000000419 520.50 546.00 0.06900279
ENSG00000000457 339.75 316.50 -0.10226805
ENSG00000000460 97.25 78.75 -0.30441833
ENSG00000000971 5219.00 6687.50 0.35769358
ENSG00000001036 2327.00 1785.75 -0.38194109
mycounts <- meancounts[!to.rm.inds,]
#...or you can do it this way
zero.vals <- which(meancounts[,1:2]==0, arr.ind = TRUE)
to.rm <- unique(zero.vals[,1])
mycounts <- meancounts[-to.rm,]
head(mycounts)
control.mean treated.mean log2fc
ENSG00000000003 900.75 658.00 -0.45303916
ENSG00000000419 520.50 546.00 0.06900279
ENSG00000000457 339.75 316.50 -0.10226805
ENSG00000000460 97.25 78.75 -0.30441833
ENSG00000000971 5219.00 6687.50 0.35769358
ENSG00000001036 2327.00 1785.75 -0.38194109
Q. How many genes to I have left?
nrow(mycounts)
[1] 21817
up.ind <- sum(mycounts$log2fc > +2)
up.ind
[1] 250
down.ind <- sum(mycounts$log2fc < (-2))
down.ind
[1] 367
Q8: Using the ‘up.ind’ vector above, can you determine how many upregulated genes we have at greater than 2fc level? - 250
Q9: Using the ‘down.ind’ vector above, can you determine how many downregulated genes we have at greater than 2fc level? - 367
Q10: Do you trust these results? Why? - Not yet, because the differences might not be significant.
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = metadata,
design = ~dex)
converting counts to integer mode
Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
design formula are characters, converting to factors
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
To get the results out of this ‘dds’ object, we can use the DESeq ‘results()’ function.
res <- results(dds)
head(res)
log2 fold change (MLE): dex treated vs control
Wald test p-value: dex treated vs control
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue
<numeric> <numeric> <numeric> <numeric> <numeric>
ENSG00000000003 747.194195 -0.3507030 0.168246 -2.084470 0.0371175
ENSG00000000005 0.000000 NA NA NA NA
ENSG00000000419 520.134160 0.2061078 0.101059 2.039475 0.0414026
ENSG00000000457 322.664844 0.0245269 0.145145 0.168982 0.8658106
ENSG00000000460 87.682625 -0.1471420 0.257007 -0.572521 0.5669691
ENSG00000000938 0.319167 -1.7322890 3.493601 -0.495846 0.6200029
padj
<numeric>
ENSG00000000003 0.163035
ENSG00000000005 NA
ENSG00000000419 0.176032
ENSG00000000457 0.961694
ENSG00000000460 0.815849
ENSG00000000938 NA
A common summary visualization is called a Volcano plot:
mycols <- rep("gray", nrow(res))
mycols[ res$log2FoldChange >2 ] <- "black"
mycols[ res$log2FoldChange < -2 ] <- "black"
mycols[ res$padj > 0.05] <- "red"
plot(res$log2FoldChange, -log(res$padj), col=mycols,
xlab="Log2 Fold-Change", ylab="log P-value")
abline(v=c(-2,2), col="red")
abline(h=-log(0.05), col="blue")
write.csv(res, file="myresults.csv")
#adding annotation data
We need to translate or “map” our ensemble IDs into more understandable gene names and the identifiers that other useful databases use.
library("org.Hs.eg.db")
Loading required package: AnnotationDbi
library(AnnotationDbi)
library(pathview)
##############################################################################
Pathview is an open source software package distributed under GNU General
Public License version 3 (GPLv3). Details of GPLv3 is available at
http://www.gnu.org/licenses/gpl-3.0.html. Particullary, users are required to
formally cite the original Pathview paper (not just mention it) in publications
or products. For details, do citation("pathview") within R.
The pathview downloads and uses KEGG data. Non-academic uses may require a KEGG
license agreement (details at http://www.kegg.jp/kegg/legal.html).
##############################################################################
library(gage)
library(gageData)
data(kegg.sets.hs)
# Examine the first 2 pathways in this kegg set for humans
head(kegg.sets.hs, 2)
$`hsa00232 Caffeine metabolism`
[1] "10" "1544" "1548" "1549" "1553" "7498" "9"
$`hsa00983 Drug metabolism - other enzymes`
[1] "10" "1066" "10720" "10941" "151531" "1548" "1549" "1551"
[9] "1553" "1576" "1577" "1806" "1807" "1890" "221223" "2990"
[17] "3251" "3614" "3615" "3704" "51733" "54490" "54575" "54576"
[25] "54577" "54578" "54579" "54600" "54657" "54658" "54659" "54963"
[33] "574537" "64816" "7083" "7084" "7172" "7363" "7364" "7365"
[41] "7366" "7367" "7371" "7372" "7378" "7498" "79799" "83549"
[49] "8824" "8833" "9" "978"
columns(org.Hs.eg.db)
[1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
[6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME"
[11] "GENETYPE" "GO" "GOALL" "IPI" "MAP"
[16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM"
[21] "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
[26] "UNIPROT"
res$symbol <- mapIds(org.Hs.eg.db,
keys=row.names(res), #our genenames
keytype="ENSEMBL",
column="SYMBOL",
multiVals="first")
'select()' returned 1:many mapping between keys and columns
res$entrez <- mapIds(org.Hs.eg.db,
keys=row.names(res), #our genenames
keytype="ENSEMBL",
column="ENTREZID",
multiVals="first")
'select()' returned 1:many mapping between keys and columns
res$uniprot <- mapIds(org.Hs.eg.db,
keys=row.names(res), #our genenames
keytype="ENSEMBL",
column="UNIPROT",
multiVals="first")
'select()' returned 1:many mapping between keys and columns
res$genename <- mapIds(org.Hs.eg.db,
keys=row.names(res), #our genenames
keytype="ENSEMBL",
column="GENENAME",
multiVals="first")
'select()' returned 1:many mapping between keys and columns
data("kegg.sets.hs")
head(kegg.sets.hs,2)
$`hsa00232 Caffeine metabolism`
[1] "10" "1544" "1548" "1549" "1553" "7498" "9"
$`hsa00983 Drug metabolism - other enzymes`
[1] "10" "1066" "10720" "10941" "151531" "1548" "1549" "1551"
[9] "1553" "1576" "1577" "1806" "1807" "1890" "221223" "2990"
[17] "3251" "3614" "3615" "3704" "51733" "54490" "54575" "54576"
[25] "54577" "54578" "54579" "54600" "54657" "54658" "54659" "54963"
[33] "574537" "64816" "7083" "7084" "7172" "7363" "7364" "7365"
[41] "7366" "7367" "7371" "7372" "7378" "7498" "79799" "83549"
[49] "8824" "8833" "9" "978"
foldchanges = res$log2FoldChange
names(foldchanges) = res$entrez
head(foldchanges)
7105 64102 8813 57147 55732 2268
-0.35070302 NA 0.20610777 0.02452695 -0.14714205 -1.73228897
keggres = gage(foldchanges, gsets=kegg.sets.hs)
attributes(keggres)
$names
[1] "greater" "less" "stats"
head(keggres$less, 3)
p.geomean stat.mean p.val
hsa05332 Graft-versus-host disease 0.0004250461 -3.473346 0.0004250461
hsa04940 Type I diabetes mellitus 0.0017820293 -3.002352 0.0017820293
hsa05310 Asthma 0.0020045888 -3.009050 0.0020045888
q.val set.size exp1
hsa05332 Graft-versus-host disease 0.09053483 40 0.0004250461
hsa04940 Type I diabetes mellitus 0.14232581 42 0.0017820293
hsa05310 Asthma 0.14232581 29 0.0020045888
pathview(gene.data = foldchanges, pathway.id = "hsa05310")
'select()' returned 1:1 mapping between keys and columns
Info: Working in directory /Users/deliandrea/Desktop/BGGN213/BGGN213_class15_GitHub/class13
Info: Writing image file hsa05310.pathview.png
vsd <- vst(dds,blind=FALSE)
plotPCA(vsd, intgroup=c("dex"))
using ntop=500 top features by variance
#plotting with ggplot2 from scratch
pcaData <- plotPCA(vsd, intgroup=c("dex"))
using ntop=500 top features by variance