Code Snippets

The evaluation of MERFISHtools and the underyling model as published in the corresponding article. (v1.0.0)

library("biomaRt")
ensembl = useMart("ensembl")#,dataset="hsapiens_gene_ensembl", verbose=TRUE)
ensembl = useDataset("hsapiens_gene_ensembl", mart=ensembl)
library("GOstats")
library("GO.db")
library("org.Hs.eg.db")
library("mutoss")

est <- read.table(snakemake@input[[1]], header = TRUE, stringsAsFactors = FALSE)
rownames(est) <- est$feat

# translate gene names to entrez ids
ids <- getBM(attributes = c("hgnc_symbol", "entrezgene"), filters = "hgnc_symbol", values = est$feat, mart = ensembl)
# take the first entrez id in case of ambiguity
ids <- ids[!duplicated(ids$hgnc_symbol), ]
rownames(ids) <- ids$hgnc_symbol
# lookup entrez ids
est$entrez <- ids[est$feat, "entrezgene"]

# define foreground genes
significant <- est$diff_fdr <= 0.05
foreground <- est[significant, ]
if(nrow(foreground) > 0) {
    # define test parameters
    params <- new("GOHyperGParams",
                  geneIds = foreground$entrez,
                  universeGeneIds = est$entrez,
                  ontology = "BP",
                  annotation = "org.Hs.eg",
                  pvalueCutoff = 0.05,
                  conditional = TRUE,
                  testDirection = "over")
    results <- hyperGTest(params)
    print(results)
    goterms <- summary(results)

    # correct for multiple testing. We use Benjamini-Yekuteli here, because the performed tests are strongly dependent
    #by <- BY(goterms$Pvalue, 0.05)
    #goterms$adjPvalue <- by[["adjPValues"]]

    write.table(goterms, file = snakemake@output[["terms"]], row.names = FALSE, quote = FALSE, sep = "\t")

    # associate genes with go terms
    ids <- ids[!is.na(ids$entrezgene), ]
    rownames(ids) <- ids$entrezgene
    genes <- stack(geneIdUniverse(results))
    colnames(genes) <- c("entrezid", "goterm")
    genes <- genes[, c("goterm", "entrezid")]
    genes$gene <- ids[genes$entrezid, "hgnc_symbol"]
    genes$entrezid <- NULL
    genes$cv <- est[genes$gene, "cv_map"]
    genes$fdr <- est[genes$gene, "diff_fdr"]

    write.table(genes, file = snakemake@output[["genes"]], row.names = FALSE, quote = FALSE, sep = "\t")
} else {
    file.create(snakemake@output[["terms"]])
    file.create(snakemake@output[["genes"]])
}

R org.Hs.eg.db GO.db biomaRt GOstats mutoss From line 1 of scripts/go-enrichment.R

Basic RNA-seq analysis for single-end data.

degFile = snakemake@input[['degFile']]

assembly <- snakemake@params[['assembly']]

FC <- snakemake@params[['FC']]

adjp <- snakemake@params[['adjp']]

printTree <- snakemake@params[['printTree']]

library(GO.db)
library(topGO)
library(ggplot2)
library(RColorBrewer)
library(biomaRt)
library(GenomicFeatures)
library(Rgraphviz)

##----------load differentially expressed genes --------#
print("Loading differential expressed gene table")
print(degFile)

if(grepl('txt$|tsv$',degFile)){
    deg <- read.delim(file=degFile,header=TRUE,sep="\t")
}

##---------load correct Biomart------------------------#
print(getwd())
if (assembly == "hg19") {
    organismStr <- "hsapiens"
    geneID2GO <- get(load("./anno/biomaRt/hg19.Ens_75.biomaRt.GO.external.geneID2GO.RData"))
    xx <- get(load("./anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "hg38.89") {
    organismStr <- "hsapiens"
    ### to get to hg38 mappings ensembl 89!
    geneID2GO <- get(load("./anno/biomaRt/hg38.Ens_89.biomaRt.GO.external.geneID2GO.RData"))
    xx <- get(load("./anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "hg38.90") {
    organismStr <- "hsapiens"
    ### to get to hg38 mappings ensembl 90!
    geneID2GO <- get(load("./anno/biomaRt/hg38.Ens_90.biomaRt.GO.external.geneID2GO.RData"))
    xx <- get(load("./anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "mm10.78") {
    organismStr <- "mmusculus"
    ### to get to hg38 mappings ensembl 90!
    geneID2GO <- get(load("./anno/biomaRt/mm10.Ens_78.biomaRt.GO.external.geneID2GO.RData"))
    xx <- get(load("./anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "mm10.96") {
    organismStr <- "mmusculus"
    ### to get to hg38 mappings ensembl 90!
    geneID2GO <- get(load("./anno/biomaRt/mm10.Ens_96.biomaRt.GO.external.geneID2GO.RData"))
    xx <- get(load("./anno/biomaRt/GO.db.Term.list.rda"))
}


##-----------------------------------Functions--------------------------------------#
runGO <- function(geneList,xx=xx,otype,setName){
    setLength       <- sum(as.numeric(levels(geneList))[geneList]) 
    fname           <- paste(Dir, paste(setName, otype, "GO.txt", sep="_"), sep="/")
    GOData          <- new("topGOdata", ontology=otype, allGenes=geneList, annot = annFUN.gene2GO, gene2GO = geneID2GO)
    resultFisher    <- runTest(GOData, algorithm = "classic", statistic = "fisher")## statistical test for topGO
    x               <- GenTable(GOData, classicFisher=resultFisher, topNodes=length(names(resultFisher@score)))## make go table for all terms
    x               <- data.frame(x)
    pVal            <- data.frame(pval=signif(resultFisher@score, 6)) ## get unrounded pvalue
    x$enrich        <- x$Significant/x$Expected ## calculate enrichment based on what you expect by chance
    x$p.unround     <- pVal[x$GO.ID,"pval"]## put unrounded pvalue in the table
    x$p.adj         <- signif(p.adjust(x$p.unround, method="BH"), 6)## calculate the adjusted pvalue with Benjamini & Hochberg correction
    x$log.p.adj     <- -log10(x$p.adj) ## convert adjusted p value to -log10 for plot magnitude
    #x$Term.full     <- sapply(x$GO.ID, FUN=function(n){Term(xx[[n]])}) ## get the full term name
    x <- x[order(x$GO.ID),]
    write.table(x, file=fname, sep="\t", col.names=TRUE, quote=FALSE, row.names=FALSE) ## save the table
    ## you can print the tree if you want, but since I keep the list of all of them skip
    if(printTree>0){
        printGraph(GOData,## make the tree for the go data
                   resultFisher,
                   firstSigNodes = 5,
                   fn.prefix = sub("_GO.txt$", "", fname),
                   useInfo = "all",
                   pdfSW = TRUE
                   )
    }
    return(x)  
}

## function to make barplot of -log10 adjusted pvalues colored by enrichment

drawBarplot <- function(go, ontology, setName){
    go <- go[!go$p.adj > 0.01,]
    if(nrow(go)>1){
        #go$Term.full <- make.unique(paste(sapply(strsplit(as.character(substring(go$Term.full,1,50)), "\\,"), `[`, 1)))
        go$Term <- make.unique(paste(sapply(strsplit(as.character(substring(go$Term,1,50)), "\\,"), `[`, 1)))
        print(setName)
        go <- go[with(go, order(p.adj, -enrich)),]
        ## Currently there is a discrepency between xx and x, so we only use Term right now, not Term.full
        #go$Term.full <-factor(paste(go$Term.full), levels=rev(paste(go$Term.full))) ## sort table by adjusted p-value
        go$Term <-factor(paste(go$Term), levels=rev(paste(go$Term))) ## sort table by adjusted p-value
        ptitle <- paste(ontology, setName) ## plot title
        ptitle <- gsub("^.*/","",ptitle)
        pfname <- paste(setName,ontology,"pdf",sep=".")## name of png file
        if(nrow(go) < 20 ){
            toprange <- 1:nrow(go)
        }else{
            toprange <- 1:20
        }
        top <- go[toprange,]
        col <- colorRampPalette(c("white","navy"))(16)   
        pdf(file=paste(Dir, pfname, sep="/"),height=5,width=7)
        print({
           p <- ggplot(top, aes(y=log.p.adj, x=Term, fill=enrich)) + ## ggplot barplot function
               geom_bar(stat="identity",colour="black") +
               ggtitle(ptitle) +
               xlab("") + ylab("-log10(fdr)") +
               scale_fill_gradient(low=col[2], high=col[15], name="enrichment", limits=c(0,ceiling(max(top$enrich))))+
               coord_flip()+
               theme(panel.grid.major = element_line(colour = "grey"), panel.grid.minor = element_blank(), 
                     panel.background = element_blank(), axis.line = element_line(colour = "black"))+
               theme(text = element_text(size=8),
                     axis.text.x = element_text(vjust=1,color="black",size=8),
                     axis.text.y = element_text(color="black",size=8),
                     plot.title=element_text(size=10))   
       })
       dev.off()
    }
}

print("get up genes and make geneList")
up <- deg$padj < adjp & deg$log2FoldChange >= log2(FC)
up <- unique(rownames(deg[up,]))
up <- toupper(up)
all <-unique(names(geneID2GO))
up.geneList <-  factor(as.integer(all %in% up))
names(up.geneList) <- all

up.setsize <- sum(as.numeric(levels(up.geneList))[up.geneList])
print("setsize for significant genes") 
up.setsize

adjplabel <- gsub("^0\\.","",adjp)
comparison <- gsub("\\.tsv$|\\.txt$|\\.rda$|\\.RData$","",degFile)

Dir <- sub("$", "/GOterms", dirname(comparison))
if(!(file.exists(Dir))) {
      dir.create(Dir,FALSE,TRUE)
}


if (up.setsize >= 2){

print("make GO table for the up genes")
go.UP.BP <- runGO(geneList=up.geneList,xx=xx,otype="BP",setName=paste(basename(comparison),"upFC",FC, "adjp", adjp, sep="."))
drawBarplot(go=go.UP.BP,ontology="BP",setName=paste(basename(comparison),"upFC",FC, "adjp", adjp, sep="."))

}else{
up_out = snakemake@output[[1]]
write.table('No Significant Genes', file=up_out)
}

print("get down genes and make geneList")
dn <- deg$padj < adjp & deg$log2FoldChange <= -log2(FC)
dn <- unique(rownames(deg[dn,]))
dn <- toupper(dn)
all <-unique(names(geneID2GO))
dn.geneList <-  factor(as.integer(all %in% dn))
names(dn.geneList) <- all

dn.setsize <- sum(as.numeric(levels(dn.geneList))[dn.geneList])
print("setsize for significant genes") 
dn.setsize

if(dn.setsize >= 2){

print("make GO table for down genes")
go.DN.BP <- runGO(geneList=dn.geneList,xx=xx,otype="BP",setName=paste(basename(comparison),"downFC",FC, "adjp", adjp, sep="."))
print("make barplot for down genes")
drawBarplot(go=go.DN.BP,ontology="BP",setName=paste(basename(comparison),"downFC",FC, "adjp", adjp, sep="."))

}else{
down_out = snakemake@output[[2]]
write.table('No Significant Genes', file=down_out)
}

R ggplot2 GO.db RColorBrewer biomaRt Rgraphviz topGO From line 1 of scripts/runGOforDESeq2.R

This pipeline contains the data processing steps used for Thuy Ngo's 2019 cell-free RNA manuscript

degFile = snakemake@input[['degFile']]

assembly <- snakemake@params[['assembly']]

FC <- snakemake@params[['FC']]

adjp <- snakemake@params[['adjp']]

printTree <- snakemake@params[['printTree']]

library(GO.db)
library(topGO)
library(ggplot2)
library(RColorBrewer)
library(biomaRt)
library(GenomicFeatures)
library(Rgraphviz)

##----------load differentially expressed genes --------#
print("Loading differential expressed gene table")
print(degFile)

deg <- read.delim(file=degFile,header=TRUE,sep="\t")
rownames(deg) <- sub("\\.[0-9]*","",rownames(deg))

##---------load correct Biomart------------------------#
if (assembly == "hg19") {
    organismStr <- "hsapiens"
    geneID2GO <- get(load("/home/groups/CEDAR/anno/biomaRt/hg19.Ens_75.biomaRt.GO.geneID2GO.RData"))
    xx <- get(load("/home/groups/CEDAR/anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "hg38.89") {
    organismStr <- "hsapiens"
    ### to get to hg38 mappings ensembl 89!
    geneID2GO <- get(load("/home/groups/CEDAR/anno/biomaRt/hg38.Ens_89.biomaRt.GO.geneID2GO.RData"))
    xx <- get(load("/home/groups/CEDAR/anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "hg38.90") {
    organismStr <- "hsapiens"
    ### to get to hg38 mappings ensembl 90!
    geneID2GO <- get(load("/home/groups/CEDAR/anno/biomaRt/hg38.Ens_90.biomaRt.GO.geneID2GO.RData"))
    xx <- get(load("/home/groups/CEDAR/anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "mm10") {
    organismStr <- "mmusculus"
    ### to get to hg38 mappings ensembl 90!
    geneID2GO <- get(load("./anno/biomaRt/hg38.Ens_90.biomaRt.GO.external.geneID2GO.RData"))
    xx <- get(load("./anno/biomaRt/GO.db.Term.list.rda"))
}


##-----------------------------------Functions--------------------------------------#
runGO <- function(geneList,xx=xx,otype,setName){
    setLength       <- sum(as.numeric(levels(geneList))[geneList]) 
    fname           <- paste(Dir, paste(setName, otype, "GO.txt", sep="_"), sep="/")
    GOData          <- new("topGOdata", ontology=otype, allGenes=geneList, annot = annFUN.gene2GO, gene2GO = geneID2GO)
    resultFisher    <- runTest(GOData, algorithm = "classic", statistic = "fisher")## statistical test for topGO
    x               <- GenTable(GOData, classicFisher=resultFisher, topNodes=length(names(resultFisher@score)))## make go table for all terms
    x               <- data.frame(x)
    pVal            <- data.frame(pval=signif(resultFisher@score, 6)) ## get unrounded pvalue
    x$enrich        <- x$Significant/x$Expected ## calculate enrichment based on what you expect by chance
    x$p.unround     <- pVal[x$GO.ID,"pval"]## put unrounded pvalue in the table
    x$p.adj         <- signif(p.adjust(x$p.unround, method="BH"), 6)## calculate the adjusted pvalue with Benjamini & Hochberg correction
    x$log.p.adj     <- -log10(x$p.adj) ## convert adjusted p value to -log10 for plot magnitude
    #x$Term.full     <- sapply(x$GO.ID, FUN=function(n){Term(xx[[n]])}) ## get the full term name
    x <- x[order(x$GO.ID),]
    write.table(x, file=fname, sep="\t", col.names=TRUE, quote=FALSE, row.names=FALSE) ## save the table
    ## you can print the tree if you want, but since I keep the list of all of them skip
    if(printTree>0){
        printGraph(GOData,## make the tree for the go data
                   resultFisher,
                   firstSigNodes = 5,
                   fn.prefix = sub("_GO.txt$", "", fname),
                   useInfo = "all",
                   pdfSW = TRUE
                   )
    }
    return(x)  
}

## function to make barplot of -log10 adjusted pvalues colored by enrichment

drawBarplot <- function(go, ontology, setName){
    go <- go[!go$p.adj > 0.01,]
    if(nrow(go)>1){
        #go$Term.full <- make.unique(paste(sapply(strsplit(as.character(substring(go$Term.full,1,50)), "\\,"), `[`, 1)))
        go$Term <- make.unique(paste(sapply(strsplit(as.character(substring(go$Term,1,50)), "\\,"), `[`, 1)))
        print(setName)
        go <- go[with(go, order(p.adj, -enrich)),]
        ## Currently there is a discrepency between xx and x, so we only use Term right now, not Term.full
        #go$Term.full <-factor(paste(go$Term.full), levels=rev(paste(go$Term.full))) ## sort table by adjusted p-value
        go$Term <-factor(paste(go$Term), levels=rev(paste(go$Term))) ## sort table by adjusted p-value
        ptitle <- paste(ontology, setName) ## plot title
        ptitle <- gsub("^.*/","",ptitle)
        pfname <- paste(setName,ontology,"pdf",sep=".")## name of png file
        if(nrow(go) < 20 ){
            toprange <- 1:nrow(go)
        }else{
            toprange <- 1:20
        }
        top <- go[toprange,]
        col <- colorRampPalette(c("white","navy"))(16)   
        pdf(file=paste(Dir, pfname, sep="/"),height=5,width=7)
        print({
           p <- ggplot(top, aes(y=log.p.adj, x=Term, fill=enrich)) + ## ggplot barplot function
               geom_bar(stat="identity",colour="black") +
               ggtitle(ptitle) +
               xlab("") + ylab("-log10(fdr)") +
               scale_fill_gradient(low=col[2], high=col[15], name="enrichment", limits=c(0,ceiling(max(top$enrich))))+
               coord_flip()+
               theme(panel.grid.major = element_line(colour = "grey"), panel.grid.minor = element_blank(), 
                     panel.background = element_blank(), axis.line = element_line(colour = "black"))+
               theme(text = element_text(size=8),
                     axis.text.x = element_text(vjust=1,color="black",size=8),
                     axis.text.y = element_text(color="black",size=8),
                     plot.title=element_text(size=10))   
       })
       dev.off()
    }
}

print("get up genes and make geneList")
up <- deg$padj < adjp & deg$log2FoldChange >= log2(FC)
up <- unique(rownames(deg[up,]))
all <-unique(names(geneID2GO))
up.geneList <-  factor(as.integer(all %in% up))
names(up.geneList) <- all

up.setsize <- sum(as.numeric(levels(up.geneList))[up.geneList])
print("setsize for significant genes") 
up.setsize

adjplabel <- gsub("^0\\.","",adjp)
comparison <- gsub("\\.tsv$|\\.txt$|\\.rda$|\\.RData$","",degFile)

Dir <- sub("$", "/GOterms", dirname(comparison))
if(!(file.exists(Dir))) {
      dir.create(Dir,FALSE,TRUE)
}

if (up.setsize >=2) {

print("make GO table for the up genes")
#################################
go.UP.BP <- runGO(geneList=up.geneList,xx=xx,otype="BP",setName=paste(basename(comparison),"upFC",FC, "adjp", adjp, sep="."))
#go.UP.MF <- runGO(geneList=up.geneList,xx=xx,otype="MF",setName=paste(comparison,"up",sep="."))

print("make the png for the up genes")
drawBarplot(go=go.UP.BP,ontology="BP",setName=paste(basename(comparison),"upFC",FC, "adjp", adjp, sep="."))

} else {
up_out = snakemake@output[[1]]
write.table('No Significant Genes', file=up_out)
}

print("get down genes and make geneList")
dn <- deg$padj < adjp & deg$log2FoldChange <= -log2(FC)
dn <- unique(rownames(deg[dn,]))
all <-unique(names(geneID2GO))
dn.geneList <-  factor(as.integer(all %in% dn))
names(dn.geneList) <- all

dn.setsize <- sum(as.numeric(levels(dn.geneList))[dn.geneList])
print("setsize for significant genes") 
dn.setsize

if(dn.setsize >= 2){

print("make GO table for down genes")
go.DN.BP <- runGO(geneList=dn.geneList,xx=xx,otype="BP",setName=paste(basename(comparison),"downFC",FC, "adjp", adjp, sep="."))

print("make barplot for down genes")
drawBarplot(go=go.DN.BP,ontology="BP",setName=paste(basename(comparison),"downFC",FC, "adjp", adjp, sep="."))

}else{
down_out = snakemake@output[[2]]
write.table('No Significant Genes', file=down_out)
}

R ggplot2 GO.db RColorBrewer biomaRt Rgraphviz topGO From line 1 of scripts/runGOforDESeq2.R

Differential Gene Expression using RNA-Seq

library(knitr)
knitr::opts_chunk$set(echo = TRUE)
library(DESeq2)
library(ggplot2) 
library(knitr)
library(clusterProfiler)
library(biomaRt)
library(ReactomePA)
library(DOSE)
library(KEGG.db)
library(org.Mm.eg.db)
library(org.Hs.eg.db)
library(pheatmap)
library(genefilter)
library(RColorBrewer)
library(GO.db)
library(topGO)
library(dplyr)
library(gage)
library(ggsci)

R Markdown ggplot2 dplyr org.Hs.eg.db GO.db org.Mm.eg.db RColorBrewer knitr biomaRt ggsci genefilter topGO clusterProfiler gage DOSE ReactomePA From line 7 of master/README.Rmd

source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2") ; library(DESeq2)
biocLite("ggplot2") ; library(ggplot2)
biocLite("clusterProfiler") ; library(clusterProfiler)
biocLite("biomaRt") ; library(biomaRt)
biocLite("ReactomePA") ; library(ReactomePA)
biocLite("DOSE") ; library(DOSE)
biocLite("KEGG.db") ; library(KEGG.db)
biocLite("org.Mm.eg.db") ; library(org.Mm.eg.db)
biocLite("org.Hs.eg.db") ; library(org.Hs.eg.db)
biocLite("pheatmap") ; library(pheatmap)
biocLite("genefilter") ; library(genefilter)
biocLite("RColorBrewer") ; library(RColorBrewer)
biocLite("GO.db") ; library(GO.db)
biocLite("topGO") ; library(topGO)
biocLite("dplyr") ; library(dplyr)
biocLite("gage") ; library(gage)
biocLite("ggsci") ; library(ggsci)

R Markdown ggplot2 dplyr DESeq2 org.Hs.eg.db GO.db org.Mm.eg.db RColorBrewer pheatmap biomaRt ggsci genefilter topGO clusterProfiler gage DOSE ReactomePA From line 472 of master/README.Rmd

Workflow Steps and Code Snippets

The evaluation of MERFISHtools and the underyling model as published in the corresponding article. (v1.0.0)

Basic RNA-seq analysis for single-end data.

This pipeline contains the data processing steps used for Thuy Ngo's 2019 cell-free RNA manuscript

Differential Gene Expression using RNA-Seq

GO.db