Workflow Steps and Code Snippets
439 tagged steps and code snippets that match the keyword "STAR"
Repository containing bioinformatic code for macro-scale host transcriptomic data processing (v0.0.1)
shell:
    """
    STAR \
        --runMode genomeGenerate \
        --runThreadN {threads} \
        --genomeDir {output.folder} \
        --genomeFastaFiles {input.dna} \
        --sjdbGTFfile {input.gtf} \
        --sjdbOverhang {params.sjdbOverhang} \
    2> {log} 1>&2
    """
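For orientation, a standalone sketch of this indexing step is shown below; the concrete paths, thread count, and the overhang value (read length minus one) are assumptions, not values taken from the workflow.

    # Hypothetical standalone run of the indexing rule above.
    # Paths, threads and the overhang (149 for 150 bp reads) are assumed.
    STAR \
        --runMode genomeGenerate \
        --runThreadN 8 \
        --genomeDir results/star/index \
        --genomeFastaFiles resources/reference/genome.fa \
        --sjdbGTFfile resources/reference/annotation.gtf \
        --sjdbOverhang 149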
shell:
    """
    ulimit -n 90000 2> {log} 1>&2

    STAR \
        --runMode alignReads \
        --runThreadN {threads} \
        --genomeDir {input.index} \
        --readFilesIn {input.r1} {input.r2} \
        --outFileNamePrefix {params.out_prefix} \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within KeepPairs \
        --readFilesCommand "gzip -cd" \
        --quantMode GeneCounts \
    2>> {log} 1>&2
    """
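A standalone sketch of the same alignment call with assumed sample paths. The `ulimit -n` raise mirrors the rule above, since STAR's coordinate sorting can open many temporary files, and `--quantMode GeneCounts` additionally writes a per-gene count table next to the sorted BAM.

    # Hypothetical standalone run of the alignment rule above; sample paths are assumed.
    ulimit -n 90000          # BAM sorting can hit the default open-file limit
    STAR \
        --runMode alignReads \
        --runThreadN 8 \
        --genomeDir results/star/index \
        --readFilesIn sample1_1.fq.gz sample1_2.fq.gz \
        --outFileNamePrefix results/star/sample1. \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within KeepPairs \
        --readFilesCommand "gzip -cd" \
        --quantMode GeneCounts
    # Produces results/star/sample1.Aligned.sortedByCoord.out.bam
    # and results/star/sample1.ReadsPerGene.out.tab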
library(tidyverse)
library(argparse)

read_star_counts <- function(filename) {
    # Read only the first two columns. The other two are for strand-specific data
    # Extract the file name, since we have to match the Illumina name with the
    # sample name
    star_column_names <- c(
        "gene_id", "unstranded", "stranded_forward", "stranded_reverse"
    )
    filename %>%
        read_tsv(
            col_names = star_column_names,
            col_select = 1:2,
            skip = 4,
            show_col_types = FALSE
        ) %>%
        mutate(
            sample_id = filename %>%
                basename() %>%
                str_remove(".ReadsPerGene.out.tab")
        ) %>%
        select(sample_id, gene_id, counts = unstranded)
}

parser <- ArgumentParser()
parser$add_argument(
    "-i", "--input-folder",
    type = "character",
    dest = "input_folder",
    help = paste(
        "Folder that contains the STAR counts. Will search recursively for ",
        "files ended in \".ReadsPerGene.out.tab\"."
    )
)
parser$add_argument(
    "-o", "--output-file",
    type = "character",
    dest = "output_file",
    help = paste(
        "Output file with all the table containing all the counts together"
    )
)
args <- parser$parse_args()

files <- list.files(
    path = args$input_folder,
    pattern = "*.ReadsPerGene.out.tab",
    recursive = TRUE,
    full.names = TRUE
)

counts_raw <- files %>%
    map(read_star_counts) %>%
    bind_rows() %>%
    pivot_wider(names_from = sample_id, values_from = counts)

dir.create(dirname(args$output_file))
write_tsv(counts_raw, args$output_file)
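The script above aggregates the per-sample ReadsPerGene.out.tab files produced by STAR into one wide count table. A possible command-line invocation is sketched below; the script name and paths are assumptions for illustration.

    # Hypothetical invocation; the script path and file locations are assumed.
    Rscript workflow/scripts/join_star_counts.R \
        --input-folder results/star \
        --output-file results/star/counts.tsv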
Snakemake workflow: Bioinfo_Macro_Microbial_Metatranscriptomics
shell:
    """
    STAR \
        --runMode genomeGenerate \
        --runThreadN {threads} \
        --genomeDir {output.folder} \
        --genomeFastaFiles {input.dna} \
        --sjdbGTFfile {input.gtf} \
        --sjdbOverhang {params.sjdbOverhang} \
    2> {log} 1>&2
    """
shell:
    """
    ulimit -n 90000 2> {log} 1>&2

    STAR \
        --runMode alignReads \
        --runThreadN {threads} \
        --genomeDir {input.index} \
        --readFilesIn \
            {input.r1} \
            {input.r2} \
        --outFileNamePrefix {params.out_prefix} \
        --outSAMtype BAM SortedByCoordinate \
        --outSAMunmapped Within KeepPairs \
        --outReadsUnmapped Fastx \
        --readFilesCommand "gzip -cd" \
        --quantMode GeneCounts \
    2>> {log} 1>&2
    """
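Compared with the host-transcriptomics rule earlier on this page, this variant adds `--outReadsUnmapped Fastx`, so STAR also writes the unaligned mates as {prefix}Unmapped.out.mate1 and {prefix}Unmapped.out.mate2. A hedged sketch of a follow-up step that is not shown in the snippet, compressing those files for downstream metatranscriptomic analysis:

    # Assumed follow-up (not part of the rule above): compress the unmapped reads
    # that --outReadsUnmapped Fastx leaves next to the output prefix.
    gzip -c results/star/sample1.Unmapped.out.mate1 > results/star/sample1.unmapped_1.fq.gz
    gzip -c results/star/sample1.Unmapped.out.mate2 > results/star/sample1.unmapped_2.fq.gz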
High-throughput Next Generation Sequencing (NGS) data analysis using Python 3 and Snakemake
shell:
    """
    ~/miniconda2/bin/STAR --runThreadN {threads} --genomeDir genomeIndex --readFilesIn {input.trimmed1} {input.trimmed2} --outFilterIntronMotifs RemoveNoncanonical --outFileNamePrefix {params.prefix} --outSAMtype BAM SortedByCoordinate --outReadsUnmapped Fastx
    """
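This rule hard-codes the STAR binary under ~/miniconda2/bin/, which ties it to one machine. A sketch of an equivalent call that relies on STAR being on PATH; the sample and output paths and the thread count are assumptions.

    # Hypothetical portable equivalent of the rule above; paths and threads are assumed.
    STAR --runThreadN 8 \
        --genomeDir genomeIndex \
        --readFilesIn sample1_R1.trimmed.fastq sample1_R2.trimmed.fastq \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outFileNamePrefix results/sample1. \
        --outSAMtype BAM SortedByCoordinate \
        --outReadsUnmapped Fastx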
Snakemake-based analysis pipeline to identify m6As from eCLIP data
shell:
    """
    name="{params.name}"; \
    name="${{name^}}"; \
    ram="$(({params.ram}*1000000000))"; \
    curl -s --list-only {params.url_fasta}/ | if grep -q "primary_assembly"; then wget -P {output} {params.url_fasta}/${{name}}.{params.assembly_fasta}.dna.primary_assembly.fa.gz; else wget -P {output} {params.url_fasta}/${{name}}.{params.assembly_fasta}.dna.toplevel.fa.gz; fi; \
    wget -P {output} {params.url_gtf}/${{name}}.{params.assembly_gtf}.gtf.gz; \
    gzip -dr {output}; \
    echo "$(date +%b' '%d' '%H:%M:%S) Indexing genome (takes a while)..."; \
    STAR --runThreadN {threads_max} --limitGenomeGenerateRAM ${{ram}} --runMode genomeGenerate --genomeSAsparseD 2 --genomeFastaFiles {output}/*.fa --sjdbGTFfile {output}/*.gtf --genomeDir {output}/index > {log} && \
    echo "$(date +%b' '%d' '%H:%M:%S) Annotating genome (takes a while)..." && \
    seqkit split {output}/*.fa -i --by-id-prefix "" --id-regexp "([^\s]+)" -O {output}/chroms --quiet && \
    awk -F'"' -v OFS='"' '{{for(i=2; i<=NF; i+=2) gsub(";", "-", $i)}} 1' {output}/${{name}}.{params.assembly_gtf}.gtf | gtf2bed --attribute-key=gene_name - > {output}/${{name}}.{params.assembly_gtf}.geneNames.bed && \
    gtfToGenePred -genePredExt {output}/${{name}}.{params.assembly_gtf}.gtf {output}/${{name}}.{params.assembly_gtf}.genePred && \
    perl workflow/scripts/metaPlotR/make_annot_bed.pl --genomeDir {output}/chroms/ --genePred {output}/${{name}}.{params.assembly_gtf}.genePred > {output}/${{name}}.{params.assembly_gtf}.annotated.bed && \
    echo "$(date +%b' '%d' '%H:%M:%S) Sorting annotation..." && \
    sort -k1,1 -k2,2n {output}/${{name}}.{params.assembly_gtf}.annotated.bed > {output}/${{name}}.{params.assembly_gtf}.annotated.sorted.bed && \
    rm {output}/${{name}}.{params.assembly_gtf}.annotated.bed && \
    echo "$(date +%b' '%d' '%H:%M:%S) Calculating size of transcript regions (i.e. 5'UTR, CDS and 3'UTR)..." && \
    perl workflow/scripts/metaPlotR/size_of_cds_utrs.pl --annot {output}/${{name}}.{params.assembly_gtf}.annotated.sorted.bed > {output}/${{name}}.{params.assembly_gtf}.region_sizes.txt
    """
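Within this rule, only one command involves STAR: the genomeGenerate call that builds the index after the FASTA and GTF have been downloaded and decompressed. It is isolated below with assumed concrete values (a 32 GB RAM cap expressed in bytes, and a sparser suffix array via --genomeSAsparseD 2 to lower memory use at some mapping-speed cost); file names are hypothetical.

    # Indexing step in isolation; file names, thread count and RAM cap are assumed.
    STAR --runThreadN 16 \
        --limitGenomeGenerateRAM 32000000000 \
        --runMode genomeGenerate \
        --genomeSAsparseD 2 \
        --genomeFastaFiles resources/genome/Homo_sapiens.GRCh38.dna.primary_assembly.fa \
        --sjdbGTFfile resources/genome/Homo_sapiens.GRCh38.110.gtf \
        --genomeDir resources/genome/index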
shell:
    """
    STAR --runThreadN {threads} --runMode alignReads --genomeDir resources/index --readFilesIn {input.read1} {input.read2} --outFileNamePrefix {params.prefix} {params.args} > {output.bam} && \
    samtools index -@ {threads} {output.bam}
    """
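Here STAR's standard output is redirected straight into {output.bam}, which only yields a usable BAM if {params.args} makes STAR stream a coordinate-sorted BAM to stdout (for example via --outSAMtype BAM SortedByCoordinate together with --outStd BAM_SortedByCoordinate), which is also what samtools index requires. A standalone sketch under that assumption, with hypothetical paths:

    # Sketch assuming the extra arguments stream a coordinate-sorted BAM to stdout.
    STAR --runThreadN 8 --runMode alignReads \
        --genomeDir resources/index \
        --readFilesIn sample1_R1.fastq sample1_R2.fastq \
        --outFileNamePrefix results/sample1. \
        --outSAMtype BAM SortedByCoordinate \
        --outStd BAM_SortedByCoordinate > results/sample1.bam
    samtools index -@ 8 results/sample1.bam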
tool / biotools
STAR
STAR (Spliced Transcripts Alignment to a Reference): an ultrafast universal RNA-seq aligner.