Repository containing bioinformatic code for macro-scale host transcriptomic data processing

public public 1yr ago Version: v0.0.1 0 bookmarks

## Set up required software

Usage

#Clone the git repository in your terminal
git clone git@github.com:3d-omics/Bioinfo_Macro_Host_Transcriptomics.git
#Change directory to the one you cloned in the previous step
cd Bioinfo_Macro_Host_Transcriptomics
#Activate conda environment where you have snakemake
conda activate Snakemake
#Run the pipeline with the test data; it will download all the necessary software through conda. It should take less than 5 minutes.
snakemake --use-conda --jobs 8 all
  • Run it with your own data:

    • Edit config/samples.tsv and add your samples and where they are located. Here is an example of the tsv table filled with the information

      image

    • Edit config/features.yml with information regarding the reference you are using like in this example.

      image

    • Edit config/params.yml to change the execution of the steps like in this example

      image

Features

  • FASTQ processing with fastp

  • Mapping with STAR

  • SAM/BAM/CRAM processing with samtools

  • Reports with multiqc and FastQC

DAG

image

References

Code Snippets

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
shell:
    """
    fastp \
        --in1 {input.forward_} \
        --in2 {input.reverse_} \
        --out1 {output.forward_} \
        --out2 {output.reverse_} \
        --unpaired1 {output.unpaired1} \
        --unpaired2 {output.unpaired2} \
        --html {output.html} \
        --json {output.json} \
        --compression 1 \
        --verbose \
        --trim_poly_g \
        --trim_poly_x \
        --adapter_sequence {params.adapter_forward} \
        --adapter_sequence_r2 {params.adapter_reverse} \
        --thread {threads} \
        {params.extra} \
    2> {log} 1>&2
    """
12
13
shell:
    "fastqc --quiet {input} 2> {log} 1>&2"
13
14
15
16
17
shell:
    """
    ln --symbolic $(readlink --canonicalize {input.forward_}) {output.forward_}
    ln --symbolic $(readlink --canonicalize {input.reverse_}) {output.reverse_}
    """
11
12
shell:
    "pigz -dc {input.fa} > {output.fa} 2> {log}"
25
26
shell:
    "pigz -dc {input.gtf} > {output.gtf}"
29
30
31
32
33
34
35
36
37
38
39
40
41
shell:
    """
    multiqc \
        --title {params.library} \
        --force \
        --filename {params.library} \
        --outdir {params.out_dir} \
        --dirs \
        --dirs-depth 1 \
        --config {input.config} \
        {input} \
    2> {log} 1>&2
    """
12
13
14
15
16
shell:
    """
    echo "samtools_idxstats_xchr: {params.chromosome_x}" >  {output} 2>  {log}
    echo "samtools_idxstats_ychr: {params.chromosome_y}" >> {output} 2>> {log}
    """
32
33
34
35
36
37
38
39
40
41
42
shell:
    """
    multiqc \
        --filename reads \
        --title reads \
        --force \
        --outdir {params.dir} \
        --config {input.config} \
        {input} \
    2> {log} 1>&2
    """
58
59
60
61
62
63
64
65
66
67
68
shell:
    """
    multiqc \
        --title fastp \
        --force \
        --filename fastp \
        --outdir {params.dir} \
        --config {input.config} \
        {input} \
    2> {log} 1>&2
    """
84
85
86
87
88
89
90
91
92
93
94
shell:
    """
    multiqc \
        --title star \
        --force \
        --filename star \
        --outdir {params.dir} \
        --config {input.config} \
        {input} \
    2> {log} 1>&2
    """
11
12
shell:
    "samtools index {input} 2> {log} 1>&2"
27
28
29
30
31
32
33
shell:
    """
    samtools stats \
        --reference {input.reference} \
        {input.cram} \
    > {output.tsv} 2> {log}
    """
47
48
shell:
    "samtools flagstats {input.cram} > {output.txt} 2> {log}"
62
63
shell:
    "samtools idxstats {input.cram} > {output.tsv} 2> {log}"
18
19
20
21
22
23
24
25
26
27
28
shell:
    """
    STAR \
        --runMode genomeGenerate \
        --runThreadN {threads} \
        --genomeDir {output.folder} \
        --genomeFastaFiles {input.dna} \
        --sjdbGTFfile {input.gtf} \
        --sjdbOverhang {params.sjdbOverhang} \
    2> {log} 1>&2
    """
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
shell:
    """
    ulimit -n 90000 2> {log} 1>&2

    STAR \
        --runMode alignReads \
        --runThreadN {threads} \
        --genomeDir {input.index} \
        --readFilesIn {input.r1} {input.r2} \
        --outFileNamePrefix {params.out_prefix} \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within KeepPairs \
        --readFilesCommand "gzip -cd" \
        --quantMode GeneCounts \
    2>> {log} 1>&2
    """
100
101
102
103
104
105
106
107
108
109
110
111
112
shell:
    """
    samtools sort \
        -l 9 \
        -m 1G \
        -o {output.cram} \
        --output-fmt CRAM \
        --reference {input.reference} \
        -@ {threads} \
        -M \
        {input.bam} \
    2> {log} 1>&2
    """
136
137
138
139
140
141
142
shell:
    """
    Rscript workflow/scripts/join_star_table.R \
        --input-folder {params.folder} \
        --output-file {output.tsv} \
    2> {log} 1>&2
    """
SnakeMake From line 136 of rules/star.smk
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library(tidyverse)
library(argparse)

read_star_counts <- function(filename) {
  # Read a single STAR ReadsPerGene.out.tab file and return a tidy tibble
  # with one row per gene: sample_id, gene_id, counts.
  #
  # Only the first two columns are kept (gene id + unstranded counts); the
  # remaining two columns hold strand-specific counts and are dropped.
  # The sample id is recovered from the file name, since the file is named
  # "<sample>.ReadsPerGene.out.tab".

  star_column_names <- c(
    "gene_id", "unstranded", "stranded_forward", "stranded_reverse"
  )

  filename %>%
    read_tsv(
      col_names = star_column_names,
      col_select = 1:2,
      skip = 4,  # skip STAR's four summary rows (N_unmapped, N_multimapping, ...)
      show_col_types = FALSE
    ) %>%
    mutate(
      # str_remove() interprets its pattern as a regex, so the dots in the
      # suffix would match any character; fixed() makes the match literal.
      sample_id = filename %>%
        basename() %>%
        str_remove(fixed(".ReadsPerGene.out.tab"))
    ) %>%
    select(sample_id, gene_id, counts = unstranded)
}

# Command-line interface -----------------------------------------------------

parser <- ArgumentParser()

parser$add_argument(
  "-i", "--input-folder",
  type = "character",
  dest = "input_folder",
  # paste0() avoids the double space that paste()'s default sep inserted
  # after the trailing space in the first literal.
  help = paste0(
    "Folder that contains the STAR counts. Will search recursively for ",
    "files ending in \".ReadsPerGene.out.tab\"."
  )
)

parser$add_argument(
  "-o", "--output-file",
  type = "character",
  dest = "output_file",
  help = "Output file with the table containing all the counts together"
)

args <- parser$parse_args()

# Collect every per-sample counts file below the input folder.
# Note: list.files() takes a regular expression, not a glob, so the dots
# must be escaped and the suffix anchored at the end of the file name.
files <-
  list.files(
    path = args$input_folder,
    pattern = "\\.ReadsPerGene\\.out\\.tab$",
    recursive = TRUE,
    full.names = TRUE
  )

# Stack the long per-sample tables (sample_id, gene_id, counts), then
# reshape into a wide gene x sample count matrix.
counts_raw <-
  files %>%
  map(read_star_counts) %>%
  bind_rows() %>%
  pivot_wider(names_from = sample_id, values_from = counts)

# recursive = TRUE builds missing parent directories; showWarnings = FALSE
# keeps dir.create() quiet when the directory already exists.
dir.create(dirname(args$output_file), showWarnings = FALSE, recursive = TRUE)
write_tsv(counts_raw, args$output_file)
ShowHide 13 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/3d-omics/Bioinfo_Macro_Host_Transcriptomics
Name: bioinfo_macro_host_transcriptomics
Version: v0.0.1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Accessed: 2
Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...