Snakemake project for common mm10 reference files

public public 1yr ago 0 bookmarks

This repository provides a Snakemake pipeline to download and transform mm10-related reference files that are used in bioinformatics workflows. Among the derivable files include:

  • UCSC genome FASTA

  • GENCODE GFFs/GTFs

  • salmon and kallisto indices

  • filtered versions of GENCODE (protein-coding, verified 3'-ends)

  • truncated versions of transcriptome annotations (used for 3'-end tag-based RNA-seq)

This serves as a record of data provenance.

Code Snippets

 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
if (interactive()) {
  library(methods)
  Snakemake <- setClass(
    "Snakemake", 
    slots=c(
      input='list', 
      output='list',
      wildcards='list',
      threads='numeric'
    )
  )
  snakemake <- Snakemake(
      input=list(gtf="gencode.vM21.annotation.mRNA_ends_found.gtf.gz"),
      output=list(gtf="/fscratch/fanslerm/gencode.vM21.annotation.mRNA_ends_found.txcutr.w500.gtf",
                  fa="/fscratch/fanslerm/gencode.vM21.annotation.mRNA_ends_found.txcutr.w500.fa"),
      wildcards=list(width="500"),
      threads=1
  )
}

################################################################################
## Libraries and Parameters
################################################################################

library(txcutr)
library(BSgenome.Mmusculus.UCSC.mm10)
library(GenomicFeatures)

mm10 <- BSgenome.Mmusculus.UCSC.mm10

maxTxLength <- as.integer(snakemake@wildcards$width)

## set cores
BiocParallel::register(BiocParallel::MulticoreParam(snakemake@threads))

################################################################################
## Load Data, Truncate, and Export
################################################################################

txdb <- makeTxDbFromGFF(file=snakemake@input$gtf,
                        organism="Mus musculus",
                        taxonomyId=10090,
                        chrominfo=seqinfo(mm10))

txdb_result <- truncateTxome(txdb, maxTxLength)

exportGTF(txdb_result, snakemake@output$gtf)

exportFASTA(txdb_result, mm10, snakemake@output$fa)
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
if (interactive()) {
  library(methods)
  Snakemake <- setClass(
    "Snakemake", 
    slots=c(
      input='list', 
      output='list', 
      wildcards='list',
      threads='numeric'
    )
  )
  snakemake <- Snakemake(
    input=list(),
    output=list(gtf="txdb.mm10.ensGene.txcutr.w{width}.gtf",
                fa="txdb.mm10.ensGene.txcutr.w{width}.fa"),
    wildcards=list(width="500"),
    threads=1
  )
}

################################################################################
## Libraries and Parameters
################################################################################

library(txcutr)
library(BSgenome.Mmusculus.UCSC.mm10)
library(TxDb.Mmusculus.UCSC.mm10.ensGene)

mm10 <- BSgenome.Mmusculus.UCSC.mm10
txdb <- TxDb.Mmusculus.UCSC.mm10.ensGene

maxTxLength <- as.integer(snakemake@wildcards$width)

## set cores
BiocParallel::register(BiocParallel::MulticoreParam(snakemake@threads))

################################################################################
## Truncate and Export
################################################################################

txdb_result <- truncateTxome(txdb, maxTxLength)

exportGTF(txdb_result, snakemake@output$gtf)

exportFASTA(txdb_result, mm10, snakemake@output$fa)
18
19
20
21
shell:
    """
    wget -O {output} {params.url}
    """
33
34
35
36
37
38
39
shell:
    """
    wget -O- {params.url} | sort -k1,1 -k2,2n | bgzip > {output.bed}
    tabix {output.bed}
    wget -O- {params.url_tissue} | sort -k1,1 -k2,2n | bgzip > {output.bed_tissue}
    tabix {output.bed_tissue}
    """
50
51
52
53
54
55
shell:
    """
    wget {params.url_base}{output.gff}
    wget {params.url_base}{output.gtf}
    wget {params.url_base}{output.fa}
    """
65
66
67
68
shell:
    """
    gzip -cd {input} | sed 's/|.*//' | pigz -p {threads} > {output}
    """
73
74
75
76
77
shell:
    """
    wget -qO - 'http://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz' |
    gunzip -c > {output}
    """
88
89
90
91
shell:
    """
    kallisto index -i {output} {input}
    """
100
101
102
103
104
shell:
    """
    gzip -cd {input} | awk '!( $0 ~ /^#/ )' | sort --parallel={threads} -S4G -k1,1 -k4,4n | bgzip -c > {output.gff}
    tabix {output.gff}
    """
109
110
111
112
shell:
    """
    mysql --user=genome -N --host=genome-mysql.cse.ucsc.edu -A -D mm10 -e 'select name,name2 from refGene' > {output}
    """
SnakeMake From line 109 of master/Snakefile
120
121
122
123
124
125
126
shell:
    """
    gzip -cd {input} |
    awk '$0 !~ /mRNA_end_NF/' |
    bgzip -c > {output.gff}
    tabix {output.gff}
    """
133
134
135
136
137
138
shell:
    """
    gzip -cd {input} |
    awk '$0 !~ /mRNA_end_NF/' |
    bgzip -c > {output.gtf}
    """
SnakeMake From line 133 of master/Snakefile
147
148
149
150
151
152
153
shell:
    """
    gzip -cd {input} |
    grep -v '^#' |
    gtfToGenePred /dev/stdin /dev/stdout |
    genePredToBed stdin {output}
    """
160
161
162
163
shell:
    """
    gzip -cd {input} | awk -f scripts/gtf_tx2gene.awk > {output}
    """
SnakeMake From line 160 of master/Snakefile
168
169
170
171
172
shell:
    """
    wget -qO - 'https://github.com/morrislab/qapa/releases/download/v1.3.0/qapa_3utrs.gencode_VM22.mm10.bed.gz' |
    gunzip -c > {output}
    """
SnakeMake From line 168 of master/Snakefile
182
183
184
185
shell:
    """
    qapa fasta -f {input.fa} {input.bed} {output}
    """
195
196
197
198
shell:
    """
    salmon index -t {input} -i {output}
    """
213
214
215
216
217
218
219
220
221
shell:
    """
    grep "^>" {input.genome} | cut -d ' ' -f 1 > {params.decoy}
    sed -i.bak -e 's/>//g' {params.decoy}
    cat {input.qapa} {input.genome} > {params.fa}
    salmon index -t {params.fa} -d {params.decoy} -p {threads} -i {output}
    rm {params.decoy}
    rm {params.fa}
    """
226
227
228
229
230
231
232
233
shell:
    """
    mysql --user anonymous --host=martdb.ensembl.org --port=5316 -A ensembl_mart_{wildcards.version} \\
    -e "select stable_id_1023 as 'Gene stable ID', stable_id_1066 as 'Transcript stable ID', \\
    biotype_1020 as 'Gene type', biotype_1064 as 'Transcript type', \\
    display_label_1074 as 'Gene name' from mmusculus_gene_ensembl__transcript__main" \\
    > {output}
    """
SnakeMake From line 226 of master/Snakefile
245
script: "scripts/txcutr_txdb_ensgene.R"
SnakeMake From line 245 of master/Snakefile
259
script: "scripts/txcutr_gtf.R"
SnakeMake From line 259 of master/Snakefile
270
271
272
273
shell:
    """
    kallisto index -i {output} {input}
    """
ShowHide 15 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/mfansler/mm10-db
Name: mm10-db
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...