Quantitative shotgun MS proteomics

public public 1yr ago Version: dev 0 bookmarks

Quantitative shotgun MS proteomics as done in the Lehtio lab

This pipeline is no longer being maintained

Please see nf-core/quantms for a more up-to-date pipeline that covers much of the same functionality.

Introduction

The pipeline is built using Nextflow, a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with Docker / Singularity containers, making installation trivial and results highly reproducible.

Documentation

The nf-core/ddamsproteomics pipeline comes with documentation about the pipeline, found in the docs/ directory:

  1. Installation

  2. Pipeline configuration

  3. Running the pipeline

  4. Output and how to interpret the results

  5. Troubleshooting

Code Snippets

235
236
237
238
239
240
241
242
243
244
245
246
"""
echo $workflow.manifest.version > v_pipeline.txt
echo $workflow.nextflow.version > v_nextflow.txt
msgf_plus | head -n1 > v_msgf.txt
hardklor | head -n1 > v_hk.txt || true
kronik | head -n2 > v_kr.txt
percolator -h |& head -n1 > v_perco.txt || true
msspsmtable --version > v_mss.txt
source activate openms-2.4.0
IsobaricAnalyzer |& grep Version > v_openms.txt || true
scrape_software_versions.py > software_versions_mqc.yaml
"""
313
314
315
316
317
318
319
320
"""
# Run hardklor on config file with added line for in/out files
# then run kronik on hardklor and quant isobaric labels if necessary
hardklor <(cat $hkconf <(echo "$infile" hardklor.out))
kronik -c 5 -d 3 -g 1 -m 8000 -n 600 -p 10 hardklor.out ${sample}.kr
source activate openms-2.4.0
${params.isobaric ? "IsobaricAnalyzer  -type $params.isobaric -in $infile -out \"${infile}.consensusXML\" -extraction:select_activation \"$activationtype\" -extraction:reporter_mass_shift $massshift -extraction:min_precursor_intensity 1.0 -extraction:keep_unannotated_precursor true -quantification:isotope_correction true" : ''}
"""
343
344
345
"""
msslookup spectra -i ${mzmlfiles.join(' ')} --setnames ${setnames.join(' ')}
"""
397
398
399
400
401
402
"""
# SQLite lookup needs copying to not modify the input file which would mess up a rerun with -resume
cat $lookup > db.sqlite
msslookup ms1quant --dbfile db.sqlite -i ${krfns.join(' ')} --spectra ${mzmls.join(' ')} --quanttype kronik --mztol 20.0 --mztoltype ppm --rttol 5.0 
msslookup isoquant --dbfile db.sqlite -i ${isofns.join(' ')} --spectra ${isosamples.collect{ x -> x + '.mzML' }.join(' ')}
"""
NextFlow From line 397 of master/main.nf
404
405
406
407
408
"""
# SQLite lookup needs copying to not modify the input file which would mess up a rerun with -resume
cat $lookup > db.sqlite
msslookup ms1quant --dbfile db.sqlite -i ${krfns.join(' ')} --spectra ${mzmls.join(' ')} --quanttype kronik --mztol 20.0 --mztoltype ppm --rttol 5.0 
"""
NextFlow From line 404 of master/main.nf
431
432
433
"""
sqlite3 $speclookup "SELECT mzmlfilename, COUNT(*) FROM mzml JOIN mzmlfiles USING(mzmlfile_id) JOIN biosets USING(set_id) GROUP BY mzmlfilename" > amount_spectra_files
"""
NextFlow From line 431 of master/main.nf
459
460
461
462
463
464
465
466
467
468
469
470
471
"""
#!/usr/bin/env python
platesets = [\"${splates.join('", "')}\"]
platescans = {p: 0 for p in platesets}
fileplates = {fn: p for fn, p in zip([\"${mzmlfiles.join('", "')}\"], platesets)}
with open('nr_spec_per_file') as fp:
    for line in fp:
        fn, scans = line.strip('\\n').split('|')
        platescans[fileplates[fn]] += int(scans)
with open('scans_per_plate', 'w') as fp:
    for plate, scans in platescans.items():
        fp.write('{}\\t{}\\n'.format(plate, scans))
"""
NextFlow From line 459 of master/main.nf
492
493
494
495
"""
tryprev.py $tdb
cat $tdb decoy_${tdb} > db.fa
"""
NextFlow From line 492 of master/main.nf
513
514
515
516
517
"""
msgf_plus -Xmx16G -d $db -s $x -o "${sample}.mzid" -thread 12 -mod $mods -tda 0 -t 10.0ppm -ti -1,2 -m 0 -inst ${msgfinstrument} -e 1 -protocol ${msgfprotocol} -ntt 2 -minLength 7 -maxLength 50 -minCharge 2 -maxCharge 6 -n 1 -addFeatures 1
msgf_plus -Xmx3500M edu.ucsd.msjava.ui.MzIDToTsv -i "${sample}.mzid" -o out.mzid.tsv
rm ${db.baseName.replaceFirst(/\.fasta/, "")}.c*
"""
533
534
535
536
537
538
539
"""
echo $samples
mkdir mzids
count=1;for sam in ${samples.join(' ')}; do ln -s `pwd`/mzid\$count mzids/\${sam}.mzid; echo mzids/\${sam}.mzid >> metafile; ((count++));done
msgf2pin -o percoin.xml -e trypsin -P "decoy_" metafile
percolator -j percoin.xml -X perco.xml -N 500000 --decoy-xml-output -y
"""
NextFlow From line 533 of master/main.nf
559
560
561
"""
perco_to_tsv.py -p $perco --plates ${platenames.join(' ')} --fractions ${fractions.join(' ')}
"""
NextFlow From line 559 of master/main.nf
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
"""
msspsmtable merge -i psms* -o psms.txt
msspsmtable conffilt -i psms.txt -o filtpsm --confidence-better lower --confidence-lvl 0.01 --confcolpattern 'PSM q-value'
msspsmtable conffilt -i filtpsm -o filtpep --confidence-better lower --confidence-lvl 0.01 --confcolpattern 'peptide q-value'
# SQLite lookup needs copying to not modify the input file which would mess up a rerun with -resume
cat lookup > $psmlookup
msslookup psms -i filtpep --dbfile $psmlookup ${params.onlypeptides ? '' : "--fasta ${td == 'target' ? tdb : "${ddb} --decoy"}"} ${params.martmap ? "--map ${martmap}" : ''}
msspsmtable specdata -i filtpep --dbfile $psmlookup -o prepsms.txt
${!params.noquant ? "msspsmtable quant -i prepsms.txt -o qpsms.txt --dbfile $psmlookup --precursor ${params.isobaric && td=='target' ? '--isobaric' : ''}" : 'mv prepsms.txt qpsms.txt'}
sed 's/\\#SpecFile/SpectraFile/' -i qpsms.txt
${!params.onlypeptides ? "msspsmtable genes -i qpsms.txt -o gpsms --dbfile $psmlookup" : ''}
${!params.onlypeptides ? "msslookup proteingroup -i qpsms.txt --dbfile $psmlookup" : ''}
${!params.onlypeptides ? "msspsmtable proteingroup -i gpsms -o ${params.hirief ? "pgpsms" : "$outpsms"} --dbfile $psmlookup" : 'mv qpsms.txt pgpsms'}
${params.hirief ? "peptide_pi_annotator.py -i $trainingpep -p pgpsms --o $outpsms --stripcolpattern Strip --pepcolpattern Peptide --fraccolpattern Fraction --strippatterns ${allstrips.join(' ')} --intercepts ${allstrips.collect() { params.strips[it].intercept}.join(' ')} --widths ${allstrips.collect() { params.strips[it].fr_width}.join(' ')} --ignoremods \'*\'" : ''}
msspsmtable split -i ${outpsms} --bioset
"""
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
"""
# Create peptide table from PSM table, picking best scoring unique peptides
msspeptable psm2pep -i psms -o peptides --scorecolpattern svm --spectracol 1 ${!params.noquant && params.isobaric && td == 'target' ? "--isobquantcolpattern plex" : "" } ${!params.noquant ? "--ms1quantcolpattern area" : ""}
# Move peptide sequence to first column
paste <( cut -f ${col} peptides) <( cut -f 1-${col-1},${col+1}-500 peptides) > peptide_table.txt
# Create empty protein/gene/gene-symbol tables with only the identified accessions, will be filled later
echo Protein accession |tee proteins genes symbols
tail -n+2 psms|cut -f ${accolmap.proteins}|grep -v '\\;'|grep -v "^\$"|sort|uniq >> proteins
tail -n+2 psms|cut -f ${accolmap.genes}|grep -v '\\;'|grep -v "^\$"|sort|uniq >> genes
tail -n+2 psms|cut -f ${accolmap.assoc}|grep -v '\\;'|grep -v "^\$"|sort|uniq >> symbols
# Do isobaric quantification if necessary
${normalize && td == 'target' ? "msspsmtable isoratio -i psms -o proteinratios --protcol ${accolmap.proteins} --targettable proteins --isobquantcolpattern plex --minint 0.1 --denompatterns ${setdenoms[setname].join(' ')}" : 'touch proteinratios'}
${isoquant ? "msspsmtable isoratio -i psms -o pepisoquant --targettable peptide_table.txt --protcol ${accolmap.peptides} --isobquantcolpattern plex --minint 0.1 --denompatterns ${setdenoms[setname].join(' ')} ${normalize ? '--normalize median --norm-ratios proteinratios' : ''} > normratiosused" : ''}
${isoquant ? "mv pepisoquant peptide_table.txt" : ''}
# Create linear modeled q-values of peptides (modeled svm scores vs q-values) for more protein-FDR precision.
msspeptable modelqvals -i peptide_table.txt -o ${setname}_linmod --scorecolpattern svm --fdrcolpattern '^q-value'
"""
NextFlow From line 652 of master/main.nf
716
717
718
719
720
"""
mssprottable ms1quant -i proteins -o protms1 --psmtable psms --protcol ${accolmap[acctype]}
msspsmtable isoratio -i psms -o proteintable --protcol ${accolmap[acctype]} --targettable protms1 --isobquantcolpattern plex --minint 0.1 --denompatterns ${setdenoms[setname].join(' ')} ${normalize && td == 'target' ? '--norm-ratios pratios --normalize median': ''}
mssprottable bestpeptide -i proteintable -o bestpeptides --peptable peplinmod --scorecolpattern ${acctype == 'proteins' ? '\'^q-value\'' : '\'linear model\''} --logscore --protcol ${accolmap[acctype] + 1}
"""
NextFlow From line 716 of master/main.nf
722
723
724
725
"""
${td == 'target' && !params.noquant ? "mssprottable ms1quant -i proteins -o proteintable --psmtable psms --protcol ${accolmap[acctype]}" : 'mv proteins proteintable'}
mssprottable bestpeptide -i proteintable -o bestpeptides --peptable peplinmod --scorecolpattern ${acctype == 'proteins' ? '\'^q-value\'' : '\'linear model\''} --logscore --protcol ${accolmap[acctype] + 1}
"""
NextFlow From line 722 of master/main.nf
748
749
750
"""
mssprottable pickedfdr --picktype fasta --targetfasta $tfasta --decoyfasta $dfasta ${params.fastadelim ? "--fastadelim \'${params.fastadelim}\' --genefield ${params.genefield}" : ''} -i tbestpep --decoyfn dbestpep -o ${setname}_protfdr
"""
NextFlow From line 748 of master/main.nf
752
753
754
"""
mssprottable ${acctype == 'proteins' ? 'protfdr' : 'pickedfdr --picktype result'} -i tbestpep --decoyfn dbestpep -o ${setname}_protfdr
"""
NextFlow From line 752 of master/main.nf
798
799
800
801
802
803
804
"""
# SQLite lookup needs copying to not modify the input file which would mess up a rerun with -resume
cat $lookup > db.sqlite
msslookup ${acctype == 'peptides' ? 'peptides --fdrcolpattern \'^q-value\' --peptidecol' : 'proteins --fdrcolpattern \'q-value\' --protcol'} 1 --dbfile db.sqlite -i ${tables.join(' ')} --setnames ${setnames.join(' ')} ${!params.noquant ? "--ms1quantcolpattern area" : ""}  ${!params.noquant && params.isobaric ? '--psmnrcolpattern quanted --isobquantcolpattern plex' : ''} ${acctype in ['genes', 'assoc'] ? "--genecentric ${acctype}" : ''}
${acctype == 'peptides' ? 'msspeptable build' : 'mssprottable build --mergecutoff 0.01'} --dbfile db.sqlite -o proteintable ${!params.noquant && params.isobaric ? '--isobaric' : ''} ${!params.noquant ? "--precursor": ""} --fdr ${acctype in ['genes', 'assoc'] ? "--genecentric ${acctype}" : ''} ${params.onlypeptides ? "--noncentric" : ''}
sed -i 's/\\#/Amount/g' proteintable
"""
NextFlow From line 798 of master/main.nf
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
"""
qc_psms.R ${setnames[0].size()} ${fractionation ? 'TRUE' : 'FALSE'} ${plates.join(' ')}
echo "<html><body>" > psmqc.html
for graph in psm-scans missing-tmt miscleav
  do
  [[ -e \$graph ]] && paste -d \\\\0  <(echo "<div class=\\"chunk\\" id=\\"\${graph}\\"><img src=\\"data:image/png;base64,") <(base64 -w 0 \$graph) <(echo '"></div>') >> psmqc.html
  done 
for graph in retentiontime precerror fryield msgfscore
  do
  for plateid in ${plates.join(' ')}
    do
    plate="PLATE___\${plateid}___\${graph}"
    [[ -e \$plate ]] && paste -d \\\\0  <(echo "<div class=\\"chunk \$plateid\\" id=\\"\${graph}\\"><img src=\\"data:image/png;base64,") <(base64 -w 0 \$plate) <(echo '"></div>') >> psmqc.html
    done 
  done
echo "</body></html>" >> psmqc.html
"""
NextFlow From line 823 of master/main.nf
867
868
869
870
871
872
873
874
875
876
"""
${normalize ? "count=1;for setn in ${setnames.join(' ')}; do echo '' >> norm\${count} ; tail -n+2 norm\${count} | sed \$'s/ - /\t'\${setn}\$'\t/'; ((count++)); done >> normtable" : ''}
qc_protein.R ${setnames.size()} ${acctype} $peptable ${normalize ? 'normtable' : ''}
echo "<html><body>" > featqc.html
for graph in featyield precursorarea coverage isobaric nrpsms nrpsmsoverlapping percentage_onepsm normfac ms1nrpeps;
  do
  [ -e \$graph ] && paste -d \\\\0  <(echo "<div class=\\"chunk\\" id=\\"\${graph}\\"><img src=\\"data:image/png;base64,") <(base64 -w 0 \$graph) <(echo '"></div>') >> featqc.html
  done 
echo "</body></html>" >> featqc.html
"""
NextFlow From line 867 of master/main.nf
898
899
900
901
"""
count=1; for ac in ${acctypes.join(' ')}; do mv feat\$count \$ac.html; ((count++)); done
qc_collect.py $params.name ${params.hirief ? "hirief" : "nofrac"} ${plates.join(' ')}
"""
NextFlow From line 898 of master/main.nf
920
921
922
"""
markdown_to_html.r $output_docs results_description.html
"""
NextFlow From line 920 of master/main.nf
Show/Hide 19 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://nf-co.re/ddamsproteomics
Name: ddamsproteomics
Version: dev
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...