Repository containing bioinformatic code for macro-scale host transcriptomic data processing

public public 1yr ago Version: v0.0.1 0 bookmarks

## Set up required software

Usage

#Clone the git repository in your terminal
git clone git@github.com:3d-omics/Bioinfo_Macro_Host_Transcriptomics.git
#Change directory to the one you cloned in the previous step
cd Bioinfo_Macro_Host_Transcriptomics
#Activate conda environment where you have snakemake
conda activate Snakemake
#Run the pipeline with the test data; it will download all the necessary software through conda. It should take less than 5 minutes.
snakemake --use-conda --jobs 8 all
  • Run it with your own data:

    • Edit config/samples.tsv and add your samples and where they are located. Here is an example of the tsv table filled with the information

      image

    • Edit config/features.yml with information regarding the reference you are using like in this example.

      image

    • Edit config/params.yml to change the execution of the steps like in this example

      image

Features

  • FASTQ processing with fastp

  • Mapping with STAR

  • SAM/BAM/CRAM processing with samtools

  • Reports with multiqc and FastQC

DAG

image

References

Code Snippets

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
shell:
    """
    fastp \
        --in1 {input.forward_} \
        --in2 {input.reverse_} \
        --out1 {output.forward_} \
        --out2 {output.reverse_} \
        --unpaired1 {output.unpaired1} \
        --unpaired2 {output.unpaired2} \
        --html {output.html} \
        --json {output.json} \
        --compression 1 \
        --verbose \
        --trim_poly_g \
        --trim_poly_x \
        --adapter_sequence {params.adapter_forward} \
        --adapter_sequence_r2 {params.adapter_reverse} \
        --thread {threads} \
        {params.extra} \
    2> {log} 1>&2
    """
12
13
shell:
    "fastqc --quiet {input} 2> {log} 1>&2"
13
14
15
16
17
shell:
    """
    ln --symbolic $(readlink --canonicalize {input.forward_}) {output.forward_}
    ln --symbolic $(readlink --canonicalize {input.reverse_}) {output.reverse_}
    """
11
12
shell:
    "pigz -dc {input.fa} > {output.fa} 2> {log}"
25
26
shell:
    "pigz -dc {input.gtf} > {output.gtf}"
29
30
31
32
33
34
35
36
37
38
39
40
41
shell:
    """
    multiqc \
        --title {params.library} \
        --force \
        --filename {params.library} \
        --outdir {params.out_dir} \
        --dirs \
        --dirs-depth 1 \
        --config {input.config} \
        {input} \
    2> {log} 1>&2
    """
12
13
14
15
16
shell:
    """
    echo "samtools_idxstats_xchr: {params.chromosome_x}" >  {output} 2>  {log}
    echo "samtools_idxstats_ychr: {params.chromosome_y}" >> {output} 2>> {log}
    """
32
33
34
35
36
37
38
39
40
41
42
shell:
    """
    multiqc \
        --filename reads \
        --title reads \
        --force \
        --outdir {params.dir} \
        --config {input.config} \
        {input} \
    2> {log} 1>&2
    """
58
59
60
61
62
63
64
65
66
67
68
shell:
    """
    multiqc \
        --title fastp \
        --force \
        --filename fastp \
        --outdir {params.dir} \
        --config {input.config} \
        {input} \
    2> {log} 1>&2
    """
84
85
86
87
88
89
90
91
92
93
94
shell:
    """
    multiqc \
        --title star \
        --force \
        --filename star \
        --outdir {params.dir} \
        --config {input.config} \
        {input} \
    2> {log} 1>&2
    """
11
12
shell:
    "samtools index {input} 2> {log} 1>&2"
27
28
29
30
31
32
33
shell:
    """
    samtools stats \
        --reference {input.reference} \
        {input.cram} \
    > {output.tsv} 2> {log}
    """
47
48
shell:
    "samtools flagstats {input.cram} > {output.txt} 2> {log}"
62
63
shell:
    "samtools idxstats {input.cram} > {output.tsv} 2> {log}"
18
19
20
21
22
23
24
25
26
27
28
shell:
    """
    STAR \
        --runMode genomeGenerate \
        --runThreadN {threads} \
        --genomeDir {output.folder} \
        --genomeFastaFiles {input.dna} \
        --sjdbGTFfile {input.gtf} \
        --sjdbOverhang {params.sjdbOverhang} \
    2> {log} 1>&2
    """
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
shell:
    """
    ulimit -n 90000 2> {log} 1>&2

    STAR \
        --runMode alignReads \
        --runThreadN {threads} \
        --genomeDir {input.index} \
        --readFilesIn {input.r1} {input.r2} \
        --outFileNamePrefix {params.out_prefix} \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within KeepPairs \
        --readFilesCommand "gzip -cd" \
        --quantMode GeneCounts \
    2>> {log} 1>&2
    """
100
101
102
103
104
105
106
107
108
109
110
111
112
shell:
    """
    samtools sort \
        -l 9 \
        -m 1G \
        -o {output.cram} \
        --output-fmt CRAM \
        --reference {input.reference} \
        -@ {threads} \
        -M \
        {input.bam} \
    2> {log} 1>&2
    """
136
137
138
139
140
141
142
shell:
    """
    Rscript workflow/scripts/join_star_table.R \
        --input-folder {params.folder} \
        --output-file {output.tsv} \
    2> {log} 1>&2
    """
SnakeMake From line 136 of rules/star.smk
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library(tidyverse)
library(argparse)

read_star_counts <- function(filename) {
  # Read a single STAR ReadsPerGene.out.tab file and return a tidy tibble
  # with one row per gene: sample_id, gene_id, counts.
  #
  # Only the first two columns are kept (gene id + unstranded counts); the
  # remaining two columns hold strand-specific counts and are dropped.
  # The sample id is recovered from the file name, since the file is named
  # "<sample>.ReadsPerGene.out.tab".

  star_column_names <- c(
    "gene_id", "unstranded", "stranded_forward", "stranded_reverse"
  )

  filename %>%
    read_tsv(
      col_names = star_column_names,
      col_select = 1:2,
      skip = 4,  # skip STAR's four summary rows (N_unmapped, N_multimapping, ...)
      show_col_types = FALSE
    ) %>%
    mutate(
      # str_remove() interprets its pattern as a regex, so the dots in the
      # suffix would match any character; fixed() makes the match literal.
      sample_id = filename %>%
        basename() %>%
        str_remove(fixed(".ReadsPerGene.out.tab"))
    ) %>%
    select(sample_id, gene_id, counts = unstranded)
}

# Command-line interface -----------------------------------------------------

parser <- ArgumentParser()

parser$add_argument(
  "-i", "--input-folder",
  type = "character",
  dest = "input_folder",
  # paste0() avoids the double space that paste()'s default sep inserted
  # after the trailing space in the first literal.
  help = paste0(
    "Folder that contains the STAR counts. Will search recursively for ",
    "files ending in \".ReadsPerGene.out.tab\"."
  )
)

parser$add_argument(
  "-o", "--output-file",
  type = "character",
  dest = "output_file",
  help = "Output file with the table containing all the counts together"
)

args <- parser$parse_args()

# Collect every per-sample counts file below the input folder.
# Note: list.files() takes a regular expression, not a glob, so the dots
# must be escaped and the suffix anchored at the end of the file name.
files <-
  list.files(
    path = args$input_folder,
    pattern = "\\.ReadsPerGene\\.out\\.tab$",
    recursive = TRUE,
    full.names = TRUE
  )

# Stack the long per-sample tables (sample_id, gene_id, counts), then
# reshape into a wide gene x sample count matrix.
counts_raw <-
  files %>%
  map(read_star_counts) %>%
  bind_rows() %>%
  pivot_wider(names_from = sample_id, values_from = counts)

# recursive = TRUE builds missing parent directories; showWarnings = FALSE
# keeps dir.create() quiet when the directory already exists.
dir.create(dirname(args$output_file), showWarnings = FALSE, recursive = TRUE)
write_tsv(counts_raw, args$output_file)
ShowHide 13 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/3d-omics/Bioinfo_Macro_Host_Transcriptomics
Name: bioinfo_macro_host_transcriptomics
Version: v0.0.1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Accessed: 2
Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...