Snakemake pipeline for Gene Set Enrichment: pathway and GO enrichment analysis

public public 1yr ago 0 bookmarks

Snakemake Workflow for Gene Set Enrichment: Pathway and GO Analysis

Author: Sherine Awad

Change the config.yaml file appropriately according to your data. Update parameters of Genrich in the config file. Also, change workdir where the reference genome, etc.

Then run: snakemake -jnumber_of_cores, for example for 5 cores use:

snakemake -j5

and for a dry run use:

snakemake -j1 -n

and to print the commands in a dry run use:

snakemake -j1 -n -p

For the sake reproducibility, use conda to pull same versions of tools. Snakemake and conda have to be installed in your system:

snakemake --cores --use-conda

TO DO

  1. Add more Genome for enrichment

  2. Add more parameters for a genric pipeline

Code Snippets

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
library("edgeR")
library(biomaRt)
library(dplyr)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(data.table)
library("gplots")
library(magrittr)
library(pathview)
library("org.Mm.eg.db")
library(gage)
library(gageData)
library(data.table)


args <- commandArgs(trailingOnly = TRUE)
dge = args[1]
ORGANISM  =args[2] 
compare_type = args[3]
outname = args[4]

res <- read.csv(file = dge, header = TRUE)
summary(res) 
foldchanges = res$logFC
names(foldchanges) = res$entrez
outname = paste(outname, compare_type, sep ="-")
outGO = paste(outname, "GO.csv", sep ="-")

if (ORGANISM == "HUMAN")
{
data(kegg.sets.hs)
data(go.sets.hs)
data(carta.hs)
data(sigmet.idx.hs)
data(go.subs.hs)
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]
head(kegg.sets.hs,3)
#GO
data(go.sets.hs)
data(go.subs.hs)
gobpsets = go.sets.hs[go.subs.hs$BP]
gobpres = gage(foldchanges, gsets=kegg.sets.hs, same.dir =TRUE, compare =compare_type,cutoff=0.05)
lapply(gobpres, head)
write.csv(gobpres, outGO)

} else if (ORGANISM == "MOUSE"){
data(kegg.sets.mm)
data(go.sets.mm)
data(carta.mm)
data(sigmet.idx.mm)
data(go.subs.mm)


kegg.sets.mm = kegg.sets.mm[sigmet.idx.mm]
head(kegg.sets.mm,3)
data(go.sets.mm)
data(go.subs.mm)
gobpsets = go.sets.mm[go.subs.mm$BP]
gobpres = gage(foldchanges, gsets=kegg.sets.mm, same.dir =TRUE, compare =compare_type,cutoff=0.05)
lapply(gobpres, head)
write.csv(gobpres, outGO)
} 
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
library("edgeR")
library(biomaRt)
library(dplyr)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(data.table)
library("gplots")
library(magrittr)
library(pathview)
library("org.Mm.eg.db")
library(gage)
library(gageData)
library(data.table)


args <- commandArgs(trailingOnly = TRUE)
dge = args[1]
ORGANISM  =args[2] 
compare_type = args[3]
outname = args[4]
outname 

res <- read.csv(file = dge, header = TRUE)
outname = paste(outname, compare_type, sep ="-")
out = paste(outname, "KEGG.csv", sep="-")
outUP = paste(outname, "KEGG_UP.csv", sep ="-")
outDOWN = paste(outname, "KEGG_DOWN.csv", sep ="-")

foldchanges = res$logFC
names(foldchanges) <- res$entrez
summary(foldchanges)
#---------------------------
#KEGG Analysis 
#---------------------------
if (ORGANISM == "HUMAN")
{
data(kegg.sets.hs)
data(go.sets.hs)
data(carta.hs)
data(sigmet.idx.hs)
data(go.subs.hs)
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]
head(kegg.sets.hs,3)
#---------------------------------------------------Use Kegg and gage to get upregulated and downregulated pathways
data(kegg.gs)
keggres = gage(foldchanges, gsets =kegg.sets.hs, same.dir = TRUE, compare=compare_type,make.plot = TRUE)
lapply(keggres, head)
write.csv(keggres,out)
keggrespathwaysup = data.frame(id=rownames(keggres$greater), keggres$greater) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
keggrespathwaysdn = data.frame(id=rownames(keggres$less), keggres$less) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
write.csv(keggrespathwaysup, outUP)
write.csv(keggrespathwaysdn, outDOWN)
#-------------------------------------------------------------------------------------------------------------------------------
keggresidsup = substr(keggrespathwaysup, start=1, stop=8)
keggresidsup
keggresidsdn = substr(keggrespathwaysdn, start=1, stop=8)
#gobpres = gage(foldchanges, gsets=kegg.sets.hs, same.dir =FALSE, compare ="paired",cutoff=0.05)
#lapply(gobpres, head)
#---------------------------------------------------------Define plotting function for applying later
plot_pathway = function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human", new.signature=FALSE)
#---------------------------------------------------------plot multiple pathways ups and downs 
tmpup = sapply(keggresidsup, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human"))
tmpdn = sapply(keggresidsdn, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human"))
} else if (ORGANISM == "MOUSE"){
data(kegg.sets.mm)
data(go.sets.mm)
data(carta.mm)
data(sigmet.idx.mm)
data(go.subs.mm)
kegg.sets.mm = kegg.sets.mm[sigmet.idx.mm]
head(kegg.sets.mm,3)
#---------------------------------------------------Use Kegg and gage to get upregulated and downregulated pathways
data(kegg.gs)
keggres = gage(foldchanges, gsets =kegg.sets.mm, same.dir = TRUE, compare=compare_type,make.plot = TRUE)
lapply(keggres, head)
write.csv(keggres,out)
keggrespathwaysup = data.frame(id=rownames(keggres$greater), keggres$greater) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
keggrespathwaysdn = data.frame(id=rownames(keggres$less), keggres$less) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
write.csv(keggrespathwaysup, outUP)
write.csv(keggrespathwaysdn, outDOWN)
#-------------------------------------------------------------------------------------------------------------------------------
keggresidsup = substr(keggrespathwaysup, start=1, stop=8)
keggresidsup
keggresidsdn = substr(keggrespathwaysdn, start=1, stop=8)
#gobpres = gage(foldchanges, gsets=kegg.sets.mm, same.dir =FALSE, compare ="paired",cutoff=0.05)
#lapply(gobpres, head)
#---------------------------------------------------------Define plotting function for applying later
plot_pathway = function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse", new.signature=FALSE)
#---------------------------------------------------------plot multiple pathways ups and downs 
tmpup = sapply(keggresidsup, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse"))
tmpdn = sapply(keggresidsdn, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse"))
} 
20
21
22
23
shell: 
    """
    Rscript scripts/pathway.R {input} {params.organims} {params.comparison} {params.outname}
    """
35
36
37
38
shell:
    """
    Rscript scripts/GO.R {input} {params.organims} {params.comparison} {params.outname}
    """
ShowHide 2 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/SherineAwad/GeneSetEnrichment
Name: genesetenrichment
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: BSD 3-Clause "New" or "Revised" License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...