scRNA-seq Workflow: STARsolo, Variant Calling, and Analysis

public 1yr ago Version: 5 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output, operation

You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .

scRNAseq workflow

This workflow runs STARsolo with the appropriate parameters for the particular scRNA-seq technology. Currently, 'indrop_v2', '10x_v1', '10x_v2', '10x_v3' and 'cellseq192' are supported. Optionally, variants can be called suing the GATK RNA-seq workflow. Duplicates are identified using the UB tag for each cell barcode separately. Final variants are hard-filtered, annotated with SNPEff and passed through the R package, SNPRelate, for PCA, MDS and dendrograms.

How to use

Download the pipeline using git clone [email protected]:vari-bbc/scRNAseq.git new_proj_dir_name or git clone https://github.com/vari-bbc/scRNAseq.git new_proj_dir_name depending on if you have an SSH key set up with GitHub or not, respectively.
Put fastq files or symlinks into 'raw_data/'.
Fill out 'samples.tsv':
- sample Sample name; If more than one row has the same sample name, they will be merged.
- fq1 R1 filename
- fq2 R2 filename
- RG Optional. If provided, read groups will not be inferred from fastq headers. Provide in the style specified for --outSAMattrRGline option in STAR. e.g. 'ID:zzz ”DS:z z”' or 'ID:yyy DS:yyyy'
Fill out 'bin/config.yaml' to indicate the location of index files, the scRNA-seq technology etc. See config file comments for more details.

For variant calling , set 'call_variants' to True. To variant call only a subset of the cell barcodes , specify only those barcodes in the 'sample_decoder' file. See config file for more info.
Run qsub -q bbc bin/run_snakemake.sh .

Helpful commands

snakemake -l : Print all the rules and a description of what it does.

Code Snippets

# set the library path
#.libPaths("/secondary/projects/bbc/tools/kin_R_packages/R-3.6.0_20190701_pkges")

knitr::opts_chunk$set(echo=TRUE, warning=TRUE, message=TRUE, cache=TRUE, dev=c('png','pdf'), fig.width=8, fig.height=8)

R Markdown From line 18 of scripts/read_star_solo_raw.Rmd

library(readr)
library(stringr)
library(dplyr)
library(tibble)
library(Seurat)
library(Matrix)

R Markdown dplyr stringr readr Seurat tibble Matrix From line 27 of scripts/read_star_solo_raw.Rmd

# define function for reading from 10x-style output directory and renaming by Celseq 192 barcode number as listed in Sagar...Gruen paper
read10x_and_bc_num_as_colnames <- function(tenx_dir, decoder){
  counts <- Seurat::Read10X(tenx_dir)

  # check that barcode sequences match
  stopifnot(identical(sort(colnames(counts)), sort(decoder$bc)))

  # rename columns with bc_num
  colnames(counts) <- decoder$bc_num[match(colnames(counts), decoder$bc)]

  tibble::as_tibble(counts, rownames="Gene") %>% dplyr::select(Gene, as.character(1:192))
}

R Markdown From line 38 of scripts/read_star_solo_raw.Rmd

# get input, output, params from the Snakemake pipeline
out_file <- snakemake@output[[1]] %>% str_replace(".html", ".txt")
out_file
starsolo_raw_dir <- dirname(snakemake@input[[1]])
starsolo_raw_dir # print out starsolo dir
decoder <- snakemake@params[["decoder"]]
decoder

# read barcode decoder from file in snakePipes installation
bc_nums_decoder <- read_tsv(decoder, col_names = c("bc_num","bc"))

# read the counts and rename to BC number
counts <- read10x_and_bc_num_as_colnames(starsolo_raw_dir, bc_nums_decoder)
str(counts) # print out str()

# output the counts to files
write_tsv(counts, out_file)

R Markdown From line 55 of scripts/read_star_solo_raw.Rmd

77	sessionInfo()

R Markdown From line 77 of scripts/read_star_solo_raw.Rmd

# save start time for script
start_tm <- Sys.time()
start_tm

R Markdown From line 19 of scripts/snprelate.Rmd

# set the library path
knitr::opts_chunk$set(echo=TRUE, warning=TRUE, message=TRUE, cache=FALSE, dev=c('pdf'), fig.width=8, fig.height=8)

R Markdown From line 25 of scripts/snprelate.Rmd

library(SNPRelate)
library(ggplot2)
library(dplyr)
library(tibble)
library(ggrepel)

R Markdown ggplot2 dplyr ggrepel tibble SNPRelate From line 32 of scripts/snprelate.Rmd

#outdir <- "./snprelate_out"
#if(!outdir %in% list.dirs()) dir.create(outdir, recursive = TRUE)
vcf.fn <- snakemake@input[[1]]
# convert to GDS
gds_file <- snakemake@params[["gds"]]
snpgdsVCF2GDS(vcf.fn, gds_file, method="biallelic.only")

# Figures
old_figs_folder <- snakemake@params[["figures_dir"]]
new.folder <- snakemake@params[["new_figures_dir"]]

# Set maximum missing rate
missingrate <- 0.05

R Markdown From line 45 of scripts/snprelate.Rmd

genofile <- snpgdsOpen(gds_file)

R Markdown From line 63 of scripts/snprelate.Rmd

set.seed(1000)
# Try different LD thresholds for sensitivity analysis
snpset <- snpgdsLDpruning(genofile, ld.threshold=0.5, missing.rate=missingrate) # using LD threshold default used in SNPhylo
# Get all selected snp id
snpset.id <- unlist(snpset)

R Markdown From line 73 of scripts/snprelate.Rmd

set.seed(100)
ibs <- snpgdsIBS(genofile, num.thread=1, autosome.only=TRUE, missing.rate=missingrate, snp.id = snpset.id)
# we find 1- ibs so that the values become 'distances' where higher values represent more dissimilar pairs of samples
one_minus_ibs <- 1 - ibs$ibs
colnames(one_minus_ibs) <- ibs$sample.id
rownames(one_minus_ibs) <- ibs$sample.id

R Markdown From line 85 of scripts/snprelate.Rmd

loc <- cmdscale(one_minus_ibs, k = 2)
#x <- loc[, 1]; y <- loc[, 2]
df <- tibble::as_tibble(loc, rownames="sample.id") %>%
  dplyr::rename(Dimension1=V1, Dimension2=V2)
#df$sample.id <- ibs$sample.id
# plot(x, y, xlab = "", ylab = "",
#     main = "Multidimensional Scaling Analysis (IBS)")
ggplot(df, aes(x=Dimension1, y=Dimension2, label=sample.id)) + 
  geom_point() + ggrepel::geom_label_repel() + ggtitle("MDS (based on IBS)")

R Markdown ggplot2 From line 96 of scripts/snprelate.Rmd

ibs.hc <- snpgdsHCluster(ibs)
rv <- snpgdsCutTree(ibs.hc)

R Markdown From line 112 of scripts/snprelate.Rmd

plot(rv$dendrogram, main="Dendrogram based on IBS")

R Markdown From line 117 of scripts/snprelate.Rmd

#pca <- snpgdsPCA(genofile, snp.id=snpset.id, num.thread=1)
pca <- snpgdsPCA(genofile, num.thread=1, autosome.only=TRUE, missing.rate=missingrate, snp.id = snpset.id)
# variance proportion (%)
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
tab <- data.frame(sample.id = pca$sample.id,
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    EV3 = pca$eigenvect[,3],
    EV4 = pca$eigenvect[,4],
    EV5 = pca$eigenvect[,5],
    stringsAsFactors = FALSE)
head(tab)
#plot(tab$EV1, tab$EV2, xlab="eigenvector 1", ylab="eigenvector 2")
ggplot(tab, aes(x=EV1, y=EV2, label=sample.id)) + geom_point() + ggrepel::geom_label_repel() + ggtitle("PC1 and 2")
ggplot(tab, aes(x=EV3, y=EV4, label=sample.id)) + geom_point() + ggrepel::geom_label_repel() + ggtitle("PC3 and 4")

R Markdown ggplot2 From line 123 of scripts/snprelate.Rmd

set.seed(100)
ibs <- snpgdsIBS(genofile, num.thread=1, autosome.only=TRUE, missing.rate=missingrate)
# we find 1- ibs so that the values become 'distances' where higher values represent more dissimilar pairs of samples
one_minus_ibs <- 1 - ibs$ibs
colnames(one_minus_ibs) <- ibs$sample.id
rownames(one_minus_ibs) <- ibs$sample.id

R Markdown From line 150 of scripts/snprelate.Rmd

loc <- cmdscale(one_minus_ibs, k = 2)
df <- tibble::as_tibble(loc, rownames="sample.id") %>%
  dplyr::rename(Dimension1=V1, Dimension2=V2)
ggplot(df, aes(x=Dimension1, y=Dimension2, label=sample.id)) + 
  geom_point() + ggrepel::geom_label_repel() + ggtitle("MDS (based on IBS)")

R Markdown ggplot2 From line 161 of scripts/snprelate.Rmd

ibs.hc <- snpgdsHCluster(ibs)
rv <- snpgdsCutTree(ibs.hc)

R Markdown From line 173 of scripts/snprelate.Rmd

plot(rv$dendrogram, main="Dendrogram based on IBS")

R Markdown From line 178 of scripts/snprelate.Rmd

#pca <- snpgdsPCA(genofile, snp.id=snpset.id, num.thread=1)
pca <- snpgdsPCA(genofile, num.thread=1, autosome.only=TRUE, missing.rate=missingrate)
# variance proportion (%)
pc.percent <- pca$varprop*100
head(round(pc.percent, 2))
tab <- data.frame(sample.id = pca$sample.id,
    EV1 = pca$eigenvect[,1],    # the first eigenvector
    EV2 = pca$eigenvect[,2],    # the second eigenvector
    EV3 = pca$eigenvect[,3],
    EV4 = pca$eigenvect[,4],
    EV5 = pca$eigenvect[,5],
    stringsAsFactors = FALSE)
head(tab)
#plot(tab$EV1, tab$EV2, xlab="eigenvector 1", ylab="eigenvector 2")
ggplot(tab, aes(x=EV1, y=EV2, label=sample.id)) + geom_point() + ggrepel::geom_label_repel() + ggtitle("PC1 and 2")
ggplot(tab, aes(x=EV3, y=EV4, label=sample.id)) + geom_point() + ggrepel::geom_label_repel() + ggtitle("PC3 and 4")

R Markdown ggplot2 From line 184 of scripts/snprelate.Rmd

closefn.gds(genofile)

R Markdown From line 206 of scripts/snprelate.Rmd

dir.create(new.folder, recursive = TRUE)

file.copy(list.files(old_figs_folder, full.names = TRUE), new.folder, recursive = TRUE)

R Markdown From line 211 of scripts/snprelate.Rmd

220	sessionInfo()

R Markdown From line 220 of scripts/snprelate.Rmd

# output time taken to run script
end_tm <- Sys.time()
end_tm
end_tm - start_tm

R Markdown From line 226 of scripts/snprelate.Rmd

shell:
    """
    fastqc --outdir {params.outdir} {input}
    """

SnakeMake FastQC From line 111 of main/Snakefile

shell:
    """
    fastq_screen --threads {threads} --outdir analysis/fastq_screen/ {input}
    """

SnakeMake From line 136 of main/Snakefile

shell:
   """
   STAR  \
   --runThreadN {threads} \
   --limitBAMsortRAM 137438953472 \
   --genomeDir {params.index} \
   --readFilesIn {params.fqs_and_rg[fqs]} \
   --outSAMattrRGline {params.fqs_and_rg[RG]} \
   --outSAMattributes NH HI AS nM CB UB \
   --readFilesCommand zcat  \
   --outSAMtype BAM SortedByCoordinate \
   --outFileNamePrefix {params.outprefix} \
   {params.tech_params}

   samtools index {params.outprefix}Aligned.sortedByCoord.out.bam

   pigz -p {threads} {params.outprefix}Solo.out/Gene/raw/*
   pigz -p {threads} {params.outprefix}Solo.out/Gene/filtered/*
   """

SnakeMake SAMtools STAR From line 281 of main/Snakefile

shell:
   """
   gunzip -c {input} > {output}
   """

SnakeMake From line 318 of main/Snakefile

script:
    "scripts/read_star_solo_raw.Rmd"

SnakeMake From line 341 of main/Snakefile

shell:
    """
    bamtools split -in {input} -tag CB -stub {params.out_pref}
    """

SnakeMake BamTools From line 366 of main/Snakefile

shell:
    """
    samtools reheader -c 'perl -pe "s/^(@SQ.*)(\\tSM:\S+)/\$1\$2.{wildcards.CB}/"' {params.input} 1> {output} 2> {log.stderr}
    """

SnakeMake SAMtools From line 391 of main/Snakefile

shell:
    """
    java -Xms16g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp -jar $PICARD MarkDuplicates \
    --INPUT {input} \
    --BARCODE_TAG 'UB' \
    --OUTPUT {output.bam} \
    --CREATE_INDEX true \
    --VALIDATION_STRINGENCY SILENT \
    --METRICS_FILE {output.metrics}
    """

SnakeMake From line 419 of main/Snakefile

shell:
    """
    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \
    SplitNCigarReads \
    -R {params.ref_fasta} \
    -I {input} \
    -O {output} 
    """

SnakeMake gatk From line 450 of main/Snakefile

shell:
    """
    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -XX:+UseParallelGC -XX:ParallelGCThreads={threads} -Djava.io.tmpdir=./tmp" \
        BaseRecalibrator \
        -R {params.ref_fasta} \
        -I {input} \
        -O {output} \
        {params.known_variants}
    """

SnakeMake gatk From line 480 of main/Snakefile

shell:
    """
    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g  -XX:+UseParallelGC -XX:ParallelGCThreads={threads} -Djava.io.tmpdir=./tmp" \
        ApplyBQSR \
        --add-output-sam-program-record \
        -R {params.ref_fasta} \
        -I {input.bam} \
        -O {output} \
        --bqsr-recal-file {input.recal_table}
    """

SnakeMake gatk From line 511 of main/Snakefile

shell:
    """
    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \
    HaplotypeCaller \
    -R {params.ref_fasta} \
    -I {input.bam} \
    -O {output} \
    -ERC GVCF \
    -dont-use-soft-clipped-bases \
    --native-pair-hmm-threads {threads} \
    --standard-min-confidence-threshold-for-calling 20 \
    {params.contigs} 
    """

SnakeMake gatk From line 544 of main/Snakefile

shell:
    """
    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \
    GenomicsDBImport \
    {params.sample_gvcfs} \
    --genomicsdb-workspace-path {output.genomicsdb} \
    {params.contigs} 
    """

SnakeMake gatk From line 607 of main/Snakefile

shell:
    """
    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \
    GenotypeGVCFs \
    -R {params.ref_fasta} \
    -V gendb://{params.genomicsdb} \
    -O {output.vcf}

    """

SnakeMake gatk From line 637 of main/Snakefile

shell:
    """
    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \
    SortVcf \
    -I {input.vcf} \
    -O {output.sorted_vcf} \
    -SD {params.dictionary} 

    """

SnakeMake gatk From line 667 of main/Snakefile

shell:
    """
    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \
    MergeVcfs \
    {params.in_vcfs} \
    --SEQUENCE_DICTIONARY {params.dictionary} \
    --OUTPUT {output.raw} 

    echo "mergeVcfs done." >> {log.stdout}
    echo "mergeVcfs done." >> {log.stderr}

    vt peek -r {params.ref_fasta} {output.raw} 2> {output.vt_peek_raw} 1>>{log.stdout}

    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \
    VariantFiltration \
    --R {params.ref_fasta} \
    --V {output.raw} \
    --window 35 \
    --cluster 3 \
    --filter-name "FS" \
    --filter "FS > 30.0" \
    --filter-name "QD" \
    --filter "QD < 2.0" \
    --genotype-filter-name "GQ" \
    --genotype-filter-expression "GQ < 15.0" \
    --genotype-filter-name "DP" \
    --genotype-filter-expression "DP < 10.0" \
    -O {output.filt} 

    echo "VariantFiltration done." >> {log.stdout}
    echo "VariantFiltration done." >> {log.stderr}

    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \
    SelectVariants \
    -R {params.ref_fasta} \
    -V {output.filt} \
    --exclude-filtered \
    --set-filtered-gt-to-nocall \
    -O {output.pass_only} 

    echo "SelectVariants done." >> {log.stdout}
    echo "SelectVariants done." >> {log.stderr}

    vt peek -r {params.ref_fasta} {output.pass_only} 2> {output.vt_peek_pass} 1>>{log.stdout}
    """

SnakeMake gatk vt From line 704 of main/Snakefile

shell:
    """
    bcftools reheader --threads {threads} -s {params.sample_decoder} -o {output} {input.vcf}
    bcftools index --threads {threads} -t {output}
    """

SnakeMake BCFtools From line 770 of main/Snakefile

shell:
    """
    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" VariantsToTable -V {input} \
            -R {params.ref_fasta} --sequence-dictionary {params.dictionary} \
            -F CHROM -F POS -F REF -F ALT -F TYPE -F ANN -GF AD -O {output.raw}

    head -n1 {output.raw} | perl -npe 's/\\tANN\\t/\\tGENE\\t/; s/\.AD(?=[\t\n])//g' > {output.parsed}

    # Parse the comma-separated ANN column (column 5, 0-based counting). Each comma-separated element contains a '|' separated string; We extract fields 3 and 4, 0-based for the gene names.
    tail -n+2 {output.raw} | perl -F'\\t' -lane 'my %hash; $hash{{join("|",(split(/\|/,$_))[3..4])}}++ foreach split(",", $F[5]); $F[5] = join(",", sort(keys(%hash))); print join("\\t", @F)' >> {output.parsed}
    """

SnakeMake gatk From line 800 of main/Snakefile

shell:
    """
    # Only use canonical transcript of each gene
    java -Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp -jar $SNPEFF/snpEff.jar eff \
    -v \
    -canon \
    -stats {output.html_canon} \
    {params.db_id} \
    {input} | \
    bgzip > {output.vcf_canon}

    tabix {output.vcf_canon}

    # 'default' settings
    java -Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp -jar $SNPEFF/snpEff.jar eff \
    -v \
    -stats {output.html} \
    {params.db_id} \
    {input} | \
    bgzip > {output.vcf}

    tabix {output.vcf}

    # Only annotate variants that overlap genes
    java -Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp -jar $SNPEFF/snpEff.jar eff \
    -v \
    -no-downstream \
    -no-intergenic \
    -no-upstream \
    -stats {output.html_inGene} \
    {params.db_id} \
    {input} | \
    bgzip > {output.vcf_inGene}

    tabix {output.vcf_inGene}

    """

SnakeMake tabix From line 841 of main/Snakefile

script:
    "scripts/snprelate.Rmd"

SnakeMake From line 899 of main/Snakefile

shell:
    """
    gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \
            ASEReadCounter \
            -R {params.ref_fasta} \
            -I {input} \
            -V {params.vcf} \
            --min-base-quality 20 \
            --min-mapping-quality 30 \
            -O {output}
    """