Ribo-seq_pipeline Snakemake workflow

public 1yr ago 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output, operation

You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .

This is a ribo-seq pipeline structured by snakemake template. It is in development. Insert your code into the respective folders, i.e. scripts , rules , and envs . Define the entry point of the workflow in the Snakefile and the main configuration in the config.yaml file.

Authors

chaodi (@dic)

Usage

Running on respublica by: snakemake --use-conda -c "qsub -l h_vmem={params.mem} -l mem_free={params.mem} -pe smp {threads} -V -cwd -e qsub/{params.jobName}.e -o qsub/{params.jobName}.o" -j -p

Code Snippets

shell:
    "fastq-dump -A {wildcards.sample} --gzip -O {params.outdir} {params.sraID}"

SnakeMake From line 15 of rules/download_fastq.smk

shell:
    '''
    factor=`cat {input.rpm_factors} |  grep {wildcards.sample} | cut -f2`;
    genomeCoverageBed -split -bg -ibam {input.bam} -scale $factor 1> bw_rpm/{wildcards.sample}.bg 2> {log};
    bedtools sort -i bw_rpm/{wildcards.sample}.bg 1> bw_rpm/{wildcards.sample}.sort.bg 2>> {log};
    bedGraphToBigWig bw_rpm/{wildcards.sample}.sort.bg STAR_index/chrNameLength.txt {output} 2>> {log};
    rm bw_rpm/{wildcards.sample}.bg bw_rpm/{wildcards.sample}.sort.bg
    '''

SnakeMake BEDTools bedGraphToBigWig From line 15 of rules/get_bw.smk

shell:
    '''
    rm -f {output};
    for i in {input}; do
        sampleName="$(basename $i .Log.final.out)";
        cat $i | grep 'Number of input reads' | awk '{{print $6}}' > foo1;
        cat $i | grep 'Uniquely mapped reads number' | awk '{{print $6}}' > foo2;
        cat $i | grep 'Number of reads mapped to multiple loci' | awk '{{print $9}}' > foo3;
        cat $i | grep "Uniquely mapped reads %" | awk '{{print $6}}' > foo4;
        cat $i | grep "% of reads mapped to multiple loci"|awk '{{print $9}}' > foo5;
        paste foo1 foo2 foo3 foo4 foo5 | awk '{{print "'$sampleName'\t"$1"\t"$2+$3"\t"$4+$5}}' >> {output.mappedReads}
    done
    cat {output.mappedReads} | awk '{{print $1"\t"1000000/$2}}' > {output.rpmFactor}
    sed -i '1isample\ttotal_reads\tmapped_reads\t%mapped' {output.mappedReads};
    rm -f foo*
    '''

SnakeMake From line 10 of rules/post_stat.smk

shell:
    '''
    bowtie -x {params.index} --threads {threads} {input} --un cleaned_fq/{wildcards.sample}.cleaned.fq -S cleaned_fq/{wildcards.sample}.sam &> {log}
    gzip cleaned_fq/{wildcards.sample}.cleaned.fq 
    rm -f cleaned_fq/{wildcards.sample}.sam
    '''

SnakeMake Bowtie 2 From line 17 of rules/remove_ncRNAs.smk

shell:
    '''
    rm -f {output};
    for i in {input}; do
        sampleName="$(basename $i .log)";
        cat $i | grep "reads processed" | awk -F": " '{{print $2}}'  > foo1;
        cat $i | grep "reads with at least one alignment" | awk -F": " '{{print $2}}' | sed 's/ /\t/g;s/(//g;s/%)//g' > foo2;
        paste foo1 foo2 | awk '{{print "'$sampleName'\t"$1"\t"$2"\t"$3}}' >> {output};
    done
    sed -i '1isample\ttotal_reads\tmapped_reads\t%mapped' {output};
    rm -f foo*
    '''

SnakeMake From line 32 of rules/remove_ncRNAs.smk

shell:
    "samtools sort {input} -o {output}"

SnakeMake SAMtools From line 10 of rules/riboWaltz.smk

shell:
    '''
    rm -f {log} 
    for i in {input}; do
        sampleName="$(basename $i .Aligned.toTranscriptome.out.sorted.bam)" 2>> {log};
        sample_shortName=`echo $sampleName | sed 's/RibosomeProfiling_//g'` 2>> {log};
        ln -sf ../$i {params.outdir}/$sample_shortName.bam 2>> {log}
    done
    '''

SnakeMake From line 26 of rules/riboWaltz.smk

script:
    "../scripts/riboWaltz.R"

SnakeMake From line 57 of rules/riboWaltz.smk

script:
    "../scripts/featureCount.R"

SnakeMake From line 14 of rules/RPF_analysis.smk

shell:
    '''
    # module load perl/5.26.1,  then need to export the perl path, or use the full perl path directly
    mkdir -p ../results/tmp
    /cm/shared/apps_chop/perl/5.26.1/bin/perl ~/public/tools/Pausepred_offline/offline_pausepred.pl \
    {input.bamfile} \
    1000 20 {input.transcript_fasta} \
    28,29,30 10 50 50 0,0,0 ../results/tmp/{wildcards.sample}.pause_site.txt  
    sed 's/,/\t/g' ../results/tmp/{wildcards.sample}.pause_site.txt > {output}

    '''

SnakeMake From line 29 of rules/RPF_analysis.smk

shell:
    '''
    rm -f {log}
    # (1). Preparing the transcripts annotation files: aleady done by 
    # /home/dic/public/genomes/UCSC/mm10/RiboCode_prep.sh
    # prepare_transcripts -g <gencode.v19.annotation.gtf> -f <hg19_genome.fa> -o <RiboCode_annot>

    #####
    # (2). Selecting the length range of the RPF reads and identify the P-site locations:
    mkdir -p {params.outname}
    metaplots -a {params.annot_dir} -r {input.bamfile} -s yes -o {params.metaplot} &> {log}

    ####
    # (3). Detecting translated ORFs using the ribosome-profiling data:
    RiboCode -a {params.annot_dir} -c {output.config} -l no -g -o {params.outname} &>> {log}
    # (4). (optional) Plotting the P-sites densities of predicted ORFs
    #plot_orf_density -a <RiboCode_annot> -c <config.txt> -t (transcript_id) -s (ORF_gstart) -e (ORF_gstop) 

    ####
    # (5). (optional) Counting the number of RPF reads aligned to ORFs
    ORFcount -g {params.outname}.gtf -r {input.genome_bamfile} -f 15 -l 5 -e 100 -m 26 -M 34 -o {output.orf_count} &>> {log}
    '''

SnakeMake From line 62 of rules/RPF_analysis.smk

shell:
    "samtools index {input} {output}"

SnakeMake SAMtools From line 32 of rules/star_map.smk

library(Rsubread)
library(dplyr)
library(mgsub)

log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

## Count RPFs (normalized in RPKM) on CDS for each gene, using `featureCounts`
## run all bams together
samples <- read.table(snakemake@input[["samples"]], header=T)
bamfiles <- paste0("./STAR_align/", as.vector(samples$sample),".bam")

## run one bam file
# bamfiles <- snakemake@input[["bamfile"]]
RPFcounts <- featureCounts(files=bamfiles, annot.ext=snakemake@input[['gtf']],
    isGTFAnnotationFile=TRUE, GTF.featureType="CDS", GTF.attrType="gene_id")

id_length <- RPFcounts$annotation %>% as.data.frame() %>% dplyr::select(GeneID,Length)
rownames(id_length) <- id_length$GeneID
count_table <- merge(id_length, RPFcounts$counts,by="row.names")[,-1]
mapped_reads <- RPFcounts$stat %>% dplyr::filter(Status=="Assigned")
# convert counts to RPKM
values <- mapply('/', count_table %>% summarise(across(starts_with("GSM"), ~./Length*1000*1000000)), mapped_reads[,-1])
rpkm_table <- cbind(count_table[,c(1,2)], values)
colnames(rpkm_table) <- mgsub(colnames(rpkm_table), c("RibosomeProfiling_", ".bam"), c("",""))

write.table(rpkm_table, file=snakemake@output[[1]], quote=FALSE, sep="\t", col.names=TRUE, row.names=FALSE)

R dplyr FeatureCounts Rsubread mgsub From line 2 of scripts/featureCount.R

library(riboWaltz)
library(ggplot2)
library(ggpubr)

log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

annotation_dt <- create_annotation(gtfpath=snakemake@input[["gtf"]], dataSource="UCSC", organism="Mus musculus")

# load data
reads_list <- bamtolist(bamfolder=snakemake@params[["bam_folder"]], annotation=annotation_dt)
# filter by read length
# filtered_list <- length_filter(data = reads_list, length_filter_mode = "custom", length_filter_vector = 26:32)

# P-site offset calculation
POs <- psite(reads_list, flanking=6, extremity="auto")

# update reads_list with p-site info
reads_psite_list <- psite_info(reads_list, POs)

# ------ Plots to overview the data ------- #
## 1.RPF length distribution
all_samples = names(reads_list)
figs=list()
for(sample_name in all_samples){
  length_dist <- rlength_distr(reads_list, sample=sample_name)
  figs <- c(figs, list(length_dist[["plot"]]))
}
p <- ggarrange(plotlist=figs, nrow=2, ncol=2)
pdf(snakemake@output[["RPF_length_plot"]],14,12)
print(p)
dev.off()

## 2.metaheatmaps displays the abundance of the 5' and 3' end of reads on CDSs
figs=list()
for(sample_name in all_samples){
  ends_heatmap <- rends_heat(reads_list, annotation_dt, sample=sample_name, utr5l = 25, cdsl = 50, utr3l = 25)
  figs <- c(figs, list(ends_heatmap[["plot"]] + theme(plot.title=element_text(size=20))))
}
p <- ggarrange(plotlist=figs, nrow=2, ncol=2)
pdf(snakemake@output[["RPF_ends_heatmap_plot"]],20,10)
print(p)
dev.off()

## 3.P-sites per region (5’UTRs, CDSs and 3’UTRs)
figs=list()
for(sample_name in all_samples){
  psite_region <- region_psite(reads_psite_list, annotation_dt, sample=sample_name)
  figs <- c(figs, list(psite_region[["plot"]]))
}
p <- ggarrange(plotlist=figs, nrow=2, ncol=2)
pdf(snakemake@output[["psite_region_plot"]],14,12)
print(p)
dev.off()

## 4.P-sites/RPF signal on three reading frames for 5’UTRs, CDSs and 3’UTRs
### 4.1 by length
figs=list()
for(sample_name in all_samples){
  frames_stratified <- frame_psite_length(reads_psite_list, sample=sample_name, region = "all")
  figs <- c(figs, list(frames_stratified[["plot"]]))
}
p <- ggarrange(plotlist=figs, nrow=2, ncol=2)
pdf(snakemake@output[["Psite_signal_bylength_inframes_plot"]],14,12)
print(p)
dev.off()
### 4.2 in total
figs=list()
for(sample_name in all_samples){
  frames <- frame_psite(reads_psite_list, sample =sample_name, region = "all")
  figs <- c(figs, list(frames[["plot"]]))
}
p <- ggarrange(plotlist=figs, nrow=2, ncol=2)
pdf(snakemake@output[["Psite_signal_total_inframes_plot"]],14,12)
print(p)
dev.off()

## 5.metaplot to show trinucleotide periodicity along CDSs
figs=list()
for(sample_name in all_samples){
  metaprofile <- metaprofile_psite(reads_psite_list, annotation_dt, sample=sample_name, utr5l=20, cdsl=50, utr3l=20,    plot_title="sample.transcript")
  figs <- c(figs, list(metaprofile[[paste0("plot_",sample_name)]] + theme(plot.title=element_text(size=20))))
}
p <- ggarrange(plotlist=figs, nrow=2, ncol=2)
pdf(snakemake@output[["trinucleotide_periodicity_metaprofile_plot"]],24,12)
print(p)
dev.off()

## 6. codon usage
figs=list()
for(sample_name in all_samples){
  cu_barplot <- codon_usage_psite(reads_psite_list, annotation_dt, sample = sample_name, fastapath = snakemake@input[["transcript_fasta"]], fasta_genome = FALSE, frequency_normalization = TRUE) 
  figs <- c(figs, list(cu_barplot[["plot"]] + ggtitle(sample_name) +  theme(plot.title=element_text(size=20))))
}
p <- ggarrange(plotlist=figs, nrow=2, ncol=2)
pdf(snakemake@output[["codon_usage_plot"]],24,12)
print(p)
dev.off()