Workflow Steps and Code Snippets

439 tagged steps and code snippets that match the keyword STAR

Repository containing bioinformatic code for macro-scale host transcriptomic data processing (v0.0.1)

shell:
    """
    STAR \
        --runMode genomeGenerate \
        --runThreadN {threads} \
        --genomeDir {output.folder} \
        --genomeFastaFiles {input.dna} \
        --sjdbGTFfile {input.gtf} \
        --sjdbOverhang {params.sjdbOverhang} \
    2> {log} 1>&2
    """
shell:
    """
    ulimit -n 90000 2> {log} 1>&2

    STAR \
        --runMode alignReads \
        --runThreadN {threads} \
        --genomeDir {input.index} \
        --readFilesIn {input.r1} {input.r2} \
        --outFileNamePrefix {params.out_prefix} \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within KeepPairs \
        --readFilesCommand "gzip -cd" \
        --quantMode GeneCounts \
    2>> {log} 1>&2
    """
library(tidyverse)
library(argparse)

read_star_counts <- function(filename) {
  # Read only the first two columns; the other two hold strand-specific counts.
  # Derive the sample id from the file name, since the Illumina file name has
  # to be matched to the sample name.

  star_column_names <- c(
    "gene_id", "unstranded", "stranded_forward", "stranded_reverse"
  )

  filename %>%
    read_tsv(
      col_names = star_column_names,
      col_select = 1:2,
      skip = 4, # skip the four N_* summary rows at the top of the file
      show_col_types = FALSE
    ) %>%
    mutate(
      sample_id = filename %>%
        basename() %>%
        str_remove("\\.ReadsPerGene\\.out\\.tab$")
    ) %>%
    select(sample_id, gene_id, counts = unstranded)
}

parser <- ArgumentParser()

parser$add_argument(
  "-i", "--input-folder",
  type = "character",
  dest = "input_folder",
  help = paste(
    "Folder that contains the STAR counts. Will search recursively for",
    "files ending in \".ReadsPerGene.out.tab\"."
  )
)

parser$add_argument(
  "-o", "--output-file",
  type = "character",
  dest = "output_file",
  help = paste(
    "Output file containing the merged table with all the counts"
  )
)

args <- parser$parse_args()

files <-
  list.files(
    path = args$input_folder,
    pattern = "\\.ReadsPerGene\\.out\\.tab$", # regex, not a glob
    recursive = TRUE,
    full.names = TRUE
  )

counts_raw <-
  files %>%
  map(read_star_counts) %>%
  bind_rows() %>%
  pivot_wider(names_from = sample_id, values_from = counts)

dir.create(dirname(args$output_file), showWarnings = FALSE, recursive = TRUE)
write_tsv(counts_raw, args$output_file)
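
Assuming the script above is saved as join_star_counts.R (the file name is not given in the snippet), a typical invocation would be:

Rscript join_star_counts.R \
    --input-folder results/star \
    --output-file results/star/counts.tsv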

Snakemake workflow: Bioinfo_Macro_Microbial_Metatranscriptomics

shell:
    """
    STAR \
        --runMode genomeGenerate \
        --runThreadN {threads} \
        --genomeDir {output.folder} \
        --genomeFastaFiles {input.dna} \
        --sjdbGTFfile {input.gtf} \
        --sjdbOverhang {params.sjdbOverhang} \
    2> {log} 1>&2
    """
shell:
    """
    ulimit -n 90000 2> {log} 1>&2

    STAR \
        --runMode alignReads \
        --runThreadN {threads} \
        --genomeDir {input.index} \
        --readFilesIn \
            {input.r1} \
            {input.r2} \
        --outFileNamePrefix {params.out_prefix} \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within KeepPairs \
        --outReadsUnmapped Fastx \
        --readFilesCommand "gzip -cd" \
        --quantMode GeneCounts \
    2>> {log} 1>&2
    """

High throughput Next Generation Sequencing (NGS) data analysis using Python 3 Snakemake

shell:
	"""
	~/miniconda2/bin/STAR --runThreadN {threads} --genomeDir genomeIndex --readFilesIn {input.trimmed1} {input.trimmed2} --outFilterIntronMotifs RemoveNoncanonical --outFileNamePrefix {params.prefix} --outSAMtype BAM SortedByCoordinate  --outReadsUnmapped Fastx
	"""

Snakemake based analysis pipeline to identify m6As from eCLIP data

shell:
    """
    name="{params.name}"; \
    name="${{name^}}"; \
    ram="$(({params.ram}*1000000000))"; \
    curl -s --list-only {params.url_fasta}/ | if grep -q "primary_assembly"; then wget -P {output} {params.url_fasta}/${{name}}.{params.assembly_fasta}.dna.primary_assembly.fa.gz; else wget -P {output} {params.url_fasta}/${{name}}.{params.assembly_fasta}.dna.toplevel.fa.gz; fi; \
    wget -P {output} {params.url_gtf}/${{name}}.{params.assembly_gtf}.gtf.gz; \
    gzip -dr {output}; \
    echo "$(date +%b' '%d' '%H:%M:%S) Indexing genome (takes a while)..."; \
    STAR \
        --runThreadN {threads_max} \
        --limitGenomeGenerateRAM ${{ram}} \
        --runMode genomeGenerate \
        --genomeSAsparseD 2 \
        --genomeFastaFiles {output}/*.fa \
        --sjdbGTFfile {output}/*.gtf \
        --genomeDir {output}/index \
    > {log} && \
    echo "$(date +%b' '%d' '%H:%M:%S) Annotating genome (takes a while)..." && \
    seqkit split {output}/*.fa -i --by-id-prefix "" --id-regexp "([^\s]+)" -O {output}/chroms --quiet && \
    awk -F'"' -v OFS='"' '{{for(i=2; i<=NF; i+=2) gsub(";", "-", $i)}} 1' {output}/${{name}}.{params.assembly_gtf}.gtf | gtf2bed --attribute-key=gene_name - > {output}/${{name}}.{params.assembly_gtf}.geneNames.bed && \
    gtfToGenePred -genePredExt {output}/${{name}}.{params.assembly_gtf}.gtf {output}/${{name}}.{params.assembly_gtf}.genePred && \
    perl workflow/scripts/metaPlotR/make_annot_bed.pl --genomeDir {output}/chroms/ --genePred {output}/${{name}}.{params.assembly_gtf}.genePred > {output}/${{name}}.{params.assembly_gtf}.annotated.bed && \
    echo "$(date +%b' '%d' '%H:%M:%S) Sorting annotation..." && \
    sort -k1,1 -k2,2n {output}/${{name}}.{params.assembly_gtf}.annotated.bed > {output}/${{name}}.{params.assembly_gtf}.annotated.sorted.bed && \
    rm {output}/${{name}}.{params.assembly_gtf}.annotated.bed && \
    echo "$(date +%b' '%d' '%H:%M:%S) Calculating size of transcript regions (i.e. 5'UTR, CDS and 3'UTR)..." && \
    perl workflow/scripts/metaPlotR/size_of_cds_utrs.pl --annot {output}/${{name}}.{params.assembly_gtf}.annotated.sorted.bed > {output}/${{name}}.{params.assembly_gtf}.region_sizes.txt
    """
shell:
    """
    STAR \
        --runThreadN {threads} \
        --runMode alignReads \
        --genomeDir resources/index \
        --readFilesIn {input.read1} {input.read2} \
        --outFileNamePrefix {params.prefix} \
        {params.args} \
    > {output.bam} && \
    samtools index -@ {threads} {output.bam}
    """
tool / biotools

STAR

An integrated solution to management and visualization of sequencing data.