Workflow Steps and Code Snippets
439 tagged steps and code snippets that match the keyword "STAR"
Repository containing bioinformatic code for macro-scale host transcriptomic data processing (v0.0.1)
shell:
    """
    STAR \
        --runMode genomeGenerate \
        --runThreadN {threads} \
        --genomeDir {output.folder} \
        --genomeFastaFiles {input.dna} \
        --sjdbGTFfile {input.gtf} \
        --sjdbOverhang {params.sjdbOverhang} \
    2> {log} 1>&2
    """
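For orientation, a standalone sketch of this indexing step is shown below; the concrete paths, thread count, and the overhang value (read length minus one) are assumptions, not values taken from the workflow.

    # Hypothetical standalone run of the indexing rule above.
    # Paths, threads and the overhang (149 for 150 bp reads) are assumed.
    STAR \
        --runMode genomeGenerate \
        --runThreadN 8 \
        --genomeDir results/star/index \
        --genomeFastaFiles resources/reference/genome.fa \
        --sjdbGTFfile resources/reference/annotation.gtf \
        --sjdbOverhang 149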
shell:
    """
    ulimit -n 90000 2> {log} 1>&2

    STAR \
        --runMode alignReads \
        --runThreadN {threads} \
        --genomeDir {input.index} \
        --readFilesIn {input.r1} {input.r2} \
        --outFileNamePrefix {params.out_prefix} \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within KeepPairs \
        --readFilesCommand "gzip -cd" \
        --quantMode GeneCounts \
    2>> {log} 1>&2
    """
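A standalone sketch of the same alignment call with assumed sample paths. The `ulimit -n` raise mirrors the rule above, since STAR's coordinate sorting can open many temporary files, and `--quantMode GeneCounts` additionally writes a per-gene count table next to the sorted BAM.

    # Hypothetical standalone run of the alignment rule above; sample paths are assumed.
    ulimit -n 90000          # BAM sorting can hit the default open-file limit
    STAR \
        --runMode alignReads \
        --runThreadN 8 \
        --genomeDir results/star/index \
        --readFilesIn sample1_1.fq.gz sample1_2.fq.gz \
        --outFileNamePrefix results/star/sample1. \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within KeepPairs \
        --readFilesCommand "gzip -cd" \
        --quantMode GeneCounts
    # Produces results/star/sample1.Aligned.sortedByCoord.out.bam
    # and results/star/sample1.ReadsPerGene.out.tab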
library(tidyverse)
library(argparse)

read_star_counts <- function(filename) {
    # Read only the first two columns. The other two are for strand-specific data
    # Extract the file name, since we have to match the Illumina name with the
    # sample name
    star_column_names <- c(
        "gene_id", "unstranded", "stranded_forward", "stranded_reverse"
    )
    filename %>%
        read_tsv(
            col_names = star_column_names,
            col_select = 1:2,
            skip = 4,
            show_col_types = FALSE
        ) %>%
        mutate(
            sample_id = filename %>%
                basename() %>%
                str_remove(".ReadsPerGene.out.tab")
        ) %>%
        select(sample_id, gene_id, counts = unstranded)
}

parser <- ArgumentParser()
parser$add_argument(
    "-i", "--input-folder",
    type = "character",
    dest = "input_folder",
    help = paste(
        "Folder that contains the STAR counts. Will search recursively for ",
        "files ended in \".ReadsPerGene.out.tab\"."
    )
)
parser$add_argument(
    "-o", "--output-file",
    type = "character",
    dest = "output_file",
    help = paste(
        "Output file with all the table containing all the counts together"
    )
)
args <- parser$parse_args()

files <- list.files(
    path = args$input_folder,
    pattern = "*.ReadsPerGene.out.tab",
    recursive = TRUE,
    full.names = TRUE
)

counts_raw <- files %>%
    map(read_star_counts) %>%
    bind_rows() %>%
    pivot_wider(names_from = sample_id, values_from = counts)

dir.create(dirname(args$output_file))
write_tsv(counts_raw, args$output_file)
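The script above aggregates the per-sample ReadsPerGene.out.tab files produced by STAR into one wide count table. A possible command-line invocation is sketched below; the script name and paths are assumptions for illustration.

    # Hypothetical invocation; the script path and file locations are assumed.
    Rscript workflow/scripts/join_star_counts.R \
        --input-folder results/star \
        --output-file results/star/counts.tsv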
Snakemake workflow: Bioinfo_Macro_Microbial_Metatranscriptomics
shell:
    """
    STAR \
        --runMode genomeGenerate \
        --runThreadN {threads} \
        --genomeDir {output.folder} \
        --genomeFastaFiles {input.dna} \
        --sjdbGTFfile {input.gtf} \
        --sjdbOverhang {params.sjdbOverhang} \
    2> {log} 1>&2
    """
shell:
    """
    ulimit -n 90000 2> {log} 1>&2

    STAR \
        --runMode alignReads \
        --runThreadN {threads} \
        --genomeDir {input.index} \
        --readFilesIn \
            {input.r1} \
            {input.r2} \
        --outFileNamePrefix {params.out_prefix} \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within KeepPairs \
        --outReadsUnmapped Fastx \
        --readFilesCommand "gzip -cd" \
        --quantMode GeneCounts \
    2>> {log} 1>&2
    """
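Compared with the host-transcriptomics rule earlier on this page, this variant adds `--outReadsUnmapped Fastx`, so STAR also writes the unaligned mates as {prefix}Unmapped.out.mate1 and {prefix}Unmapped.out.mate2. A hedged sketch of a follow-up step that is not shown in the snippet, compressing those files for downstream metatranscriptomic analysis:

    # Assumed follow-up (not part of the rule above): compress the unmapped reads
    # that --outReadsUnmapped Fastx leaves next to the output prefix.
    gzip -c results/star/sample1.Unmapped.out.mate1 > results/star/sample1.unmapped_1.fq.gz
    gzip -c results/star/sample1.Unmapped.out.mate2 > results/star/sample1.unmapped_2.fq.gz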
High-throughput Next Generation Sequencing (NGS) data analysis using Python 3 and Snakemake
shell:
    """
    ~/miniconda2/bin/STAR --runThreadN {threads} --genomeDir genomeIndex --readFilesIn {input.trimmed1} {input.trimmed2} --outFilterIntronMotifs RemoveNoncanonical --outFileNamePrefix {params.prefix} --outSAMtype BAM SortedByCoordinate --outReadsUnmapped Fastx
    """
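This rule hard-codes the STAR binary under ~/miniconda2/bin/, which ties it to one machine. A sketch of an equivalent call that relies on STAR being on PATH; the sample and output paths and the thread count are assumptions.

    # Hypothetical portable equivalent of the rule above; paths and threads are assumed.
    STAR --runThreadN 8 \
        --genomeDir genomeIndex \
        --readFilesIn sample1_R1.trimmed.fastq sample1_R2.trimmed.fastq \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outFileNamePrefix results/sample1. \
        --outSAMtype BAM SortedByCoordinate \
        --outReadsUnmapped Fastx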
Snakemake-based analysis pipeline to identify m6As from eCLIP data
shell:
    """
    name="{params.name}"; \
    name="${{name^}}"; \
    ram="$(({params.ram}*1000000000))"; \
    curl -s --list-only {params.url_fasta}/ | if grep -q "primary_assembly"; then wget -P {output} {params.url_fasta}/${{name}}.{params.assembly_fasta}.dna.primary_assembly.fa.gz; else wget -P {output} {params.url_fasta}/${{name}}.{params.assembly_fasta}.dna.toplevel.fa.gz; fi; \
    wget -P {output} {params.url_gtf}/${{name}}.{params.assembly_gtf}.gtf.gz; \
    gzip -dr {output}; \
    echo "$(date +%b' '%d' '%H:%M:%S) Indexing genome (takes a while)..."; \
    STAR --runThreadN {threads_max} --limitGenomeGenerateRAM ${{ram}} --runMode genomeGenerate --genomeSAsparseD 2 --genomeFastaFiles {output}/*.fa --sjdbGTFfile {output}/*.gtf --genomeDir {output}/index > {log} && \
    echo "$(date +%b' '%d' '%H:%M:%S) Annotating genome (takes a while)..." && \
    seqkit split {output}/*.fa -i --by-id-prefix "" --id-regexp "([^\s]+)" -O {output}/chroms --quiet && \
    awk -F'"' -v OFS='"' '{{for(i=2; i<=NF; i+=2) gsub(";", "-", $i)}} 1' {output}/${{name}}.{params.assembly_gtf}.gtf | gtf2bed --attribute-key=gene_name - > {output}/${{name}}.{params.assembly_gtf}.geneNames.bed && \
    gtfToGenePred -genePredExt {output}/${{name}}.{params.assembly_gtf}.gtf {output}/${{name}}.{params.assembly_gtf}.genePred && \
    perl workflow/scripts/metaPlotR/make_annot_bed.pl --genomeDir {output}/chroms/ --genePred {output}/${{name}}.{params.assembly_gtf}.genePred > {output}/${{name}}.{params.assembly_gtf}.annotated.bed && \
    echo "$(date +%b' '%d' '%H:%M:%S) Sorting annotation..." && \
    sort -k1,1 -k2,2n {output}/${{name}}.{params.assembly_gtf}.annotated.bed > {output}/${{name}}.{params.assembly_gtf}.annotated.sorted.bed && \
    rm {output}/${{name}}.{params.assembly_gtf}.annotated.bed && \
    echo "$(date +%b' '%d' '%H:%M:%S) Calculating size of transcript regions (i.e. 5'UTR, CDS and 3'UTR)..." && \
    perl workflow/scripts/metaPlotR/size_of_cds_utrs.pl --annot {output}/${{name}}.{params.assembly_gtf}.annotated.sorted.bed > {output}/${{name}}.{params.assembly_gtf}.region_sizes.txt
    """
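Within this rule, only one command involves STAR: the genomeGenerate call that builds the index after the FASTA and GTF have been downloaded and decompressed. It is isolated below with assumed concrete values (a 32 GB RAM cap expressed in bytes, and a sparser suffix array via --genomeSAsparseD 2 to lower memory use at some mapping-speed cost); file names are hypothetical.

    # Indexing step in isolation; file names, thread count and RAM cap are assumed.
    STAR --runThreadN 16 \
        --limitGenomeGenerateRAM 32000000000 \
        --runMode genomeGenerate \
        --genomeSAsparseD 2 \
        --genomeFastaFiles resources/genome/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
        --sjdbGTFfile resources/genome/Homo_sapiens.GRCh38.110.gtf \
        --genomeDir resources/genome/index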
shell:
    """
    STAR --runThreadN {threads} --runMode alignReads --genomeDir resources/index --readFilesIn {input.read1} {input.read2} --outFileNamePrefix {params.prefix} {params.args} > {output.bam} && \
    samtools index -@ {threads} {output.bam}
    """
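Here STAR's standard output is redirected straight into {output.bam}, which only yields a usable BAM if {params.args} makes STAR stream a coordinate-sorted BAM to stdout (for example via --outSAMtype BAM SortedByCoordinate together with --outStd BAM_SortedByCoordinate), which is also what samtools index requires. A standalone sketch under that assumption, with hypothetical paths:

    # Sketch assuming the extra arguments stream a coordinate-sorted BAM to stdout.
    STAR --runThreadN 8 --runMode alignReads \
        --genomeDir resources/index \
        --readFilesIn sample1_R1.fastq sample1_R2.fastq \
        --outFileNamePrefix results/sample1. \
        --outSAMtype BAM SortedByCoordinate \
        --outStd BAM_SortedByCoordinate > results/sample1.bam
    samtools index -@ 8 results/sample1.bam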
tool / biotools
STAR
STAR (Spliced Transcripts Alignment to a Reference): an ultrafast universal RNA-seq aligner.