Plant genome assembly and annotation pipeline using snakemake

public 1yr ago 0 bookmarks

View Workflow

assembly-annotation-pipeline — View Workflow

This Snakemake pipeline aims for plant genome assembly with HiFi data and genome annotation with RNA-Seq/IsoSeq.

Assembly

Modified from the VGP assembly VGP tutorial

Assembly

Annotation

Maker running inside snakemake still have the problem since the NSF lock and submit system is already included.
IsoSeq integration is still in development. For diploid or polyploidy, FLNC mapping is better than the clustering of the PacBio pipeline if you have haplotype-resolved assembly.
Automatic pipeline always confused the complex gene clusters (metabolic gene clusters / NLR gene clusters), please manually curation it before any further interpretation. WebApollo or IGV-GSAman could be a good start for manual curation. (https://www.bilibili.com/video/BV1x84y1z7ZW/)

Annotation

Code Snippets

shell:
    """
    {module}
    {asm}
    samtools faidx {input.fa}
    ln -sf {params.fa} {output.fa}
    ln -sf {params.fa} {output.ref}
    ln -sf {params.fai} {output.fai}
    ln -sf {params.fai} {output.ref_fai}
    """

SnakeMake SAMtools From line 17 of rules/01.EDTA.smk

shell:
    """
    {module}
    {EDTA}
    cd {params.DIR} && EDTA_raw.pl --genome {wildcards.name}.fa --species {species} --type ltr --threads {threads}
    """

SnakeMake From line 40 of rules/01.EDTA.smk

shell:
    """
    {module}
    {EDTA}
    cd {params.DIR} 
    EDTA_raw.pl --genome {wildcards.name}.fa --species {species} --type tir --threads {threads}
    """

SnakeMake From line 58 of rules/01.EDTA.smk

shell:
    """
    {module}
    {EDTA}
    cd {params.DIR} && EDTA_raw.pl --genome {wildcards.name}.fa --species {species} --type helitron --threads {threads}
    """

SnakeMake From line 78 of rules/01.EDTA.smk

run:
    #cmd = EDTA + " && " + RepeatMasker + " && " + RepeatModeler + " && " + blast + " && "
    cmd = module + "&&" + EDTA + " && " 
    cmd = cmd + " cd " + params.DIR + "  && EDTA.pl --genome " + wildcards.name + ".fa  --species " + species
    if lib :
       cmd = cmd + " --curatedlib " + lib
    if cds :
       cmd = cmd + " --cds " + cds
    cmd = cmd + " --sensitive " + str(sensitive) + " --anno 1  --step " + step + " -t " + str(threads) 
    #cmd = cmd + " --blast " + params.blast + " --repeatmasker " + params.masker + " --repeatmodeler " + params.modeler + " -protlib " + params.problib  
    shell("{cmd}")

SnakeMake BLAST RepeatMasker RepeatModeler From line 103 of rules/01.EDTA.smk

shell:
    """
    {module}
    {EDTA}
    grep -v -P "Satellite|rich|Simple_repeat|Low_complexity|tRNA|rRNA" {input.gff} | bedtools maskfasta -fi {input.fa} -bed - -fo {output.soft} -soft
    grep -v -P "Satellite|rich|Simple_repeat|Low_complexity|tRNA|rRNA" {input.gff} | bedtools maskfasta -fi {input.fa} -bed - -fo {output.hard} 
    """

SnakeMake BEDTools From line 125 of rules/01.EDTA.smk

shell:
    """
    {module}
    {QC} || echo "true"
    fastp -a auto --adapter_sequence_r2 auto --detect_adapter_for_pe -w {threads} -i {input.R1} -I {input.R2} --n_base_limit {params.nbase} --cut_window_size {params.window} --cut_mean_quality {params.mean_qual} --length_required {params.length} --qualified_quality_phred {params.quality} -o {output.R1} -O {output.R2} --json {output.json} --html {output.html} >&2 2>{log}
    """

SnakeMake fastp From line 24 of rules/02.braker.smk

shell:
    """
    md5sum {input.R1} 1> {output.md5_1}
    md5sum {input.R2} 1> {output.md5_2}
    """

SnakeMake From line 42 of rules/02.braker.smk

shell:
    """
    {module}
    {QC}
    fastqc  -t {threads} {input.R1} {input.R2} -o {params.outdir} >&2 2>{log}
    """

SnakeMake FastQC From line 65 of rules/02.braker.smk

shell:
    """
    {mapping}
    cp ctl/.gm_key ~/
    cp -r /public/home/baozhigui/miniconda3/envs/braker2/config/ ./
    hisat2-build -p {threads} {input.ref} {params.DIR}/{wildcards.name}
    """ 

SnakeMake HISAT2 From line 98 of rules/02.braker.smk

shell:
    """
    {mapping}
    hisat2 --dta -x {params.index} -1 {input.R1} -2 {input.R2} --rna-strandness RF --summary-file {output.summary} --new-summary -p {threads} | samtools view -@ 4 -Sb - | samtools sort -@ 6 -o {output.bam} - >&2 2>{log} 
    """ 

SnakeMake SAMtools HISAT2 From line 121 of rules/02.braker.smk

shell:
    """
    {mapping}
    stringtie -p {threads} --rf -l {wildcards.sample} -o {output.gtf} {input.bam}
    gffread -E {output.gtf} -o - | sed "s#transcript#match#g" | sed "s#exon#match_part#g" > {output.gff3}
    """ 

SnakeMake gffread StringTie From line 136 of rules/02.braker.smk

shell:
    """
    {mapping}
    hisat2-build -p {threads} {input.ref} {params.DIR}/hardmask.{wildcards.name}
    """ 

SnakeMake HISAT2 From line 156 of rules/02.braker.smk

shell:
    """
    {mapping}
    hisat2 --dta -x {params.index} -1 {input.R1} -2 {input.R2} --rna-strandness RF --summary-file {output.summary} --new-summary -p {threads} | samtools view -@ 4 -Sb - | samtools sort -@ 6 -o {output.bam} - >&2 2>{log} 
    """ 

SnakeMake SAMtools HISAT2 From line 177 of rules/02.braker.smk

shell:
    """
    {mapping}
    minimap2 -ax splice:hq -uf {input.ref} {input.fa} -t {threads} | samtools view -@ 4 -Shb - | samtools sort -@ 4 -o {output.bam}
    """ 

SnakeMake SAMtools Minimap2 From line 196 of rules/02.braker.smk

run:
    bam = ",".join(input.bam) + "," + ",".join(input.iso_bam)
    shell("{module} && {genemark} && {braker2} && braker.pl --species=braker_{name} --genome={input.ref} --workingdir={params.DIR} --gff3 --AUGUSTUS_CONFIG_PATH={augustus_conf}  --nocleanup --softmasking --bam={bam} --cores {threads}")

SnakeMake From line 216 of rules/02.braker.smk

run:
    bam = ",".join(input.bam)
    shell("{module} && {genemark} && {braker2} && braker.pl --species=braker_{name} --genome={input.ref} --workingdir={params.DIR} --gff3 --AUGUSTUS_CONFIG_PATH={augustus_conf}  --nocleanup --softmasking --bam={bam} --cores {threads}")

SnakeMake From line 233 of rules/02.braker.smk

shell:
    """
    {module}
    {maker}
    mkdir -p results/03.annotation/06.maker/round1 && cd results/03.annotation/06.maker/round1
    cp ../../../../ctl/.gm_key ~/
    cp ../../../../ctl/maker/round1/*.ctl ./
    num=`ls {outdir}/results/03.annotation/04.RNA-seq/stringtie/*.gff3|wc -l`
    if [ $num -gt 1 ];then
        gff=`ls {outdir}/results/03.annotation/04.RNA-seq/stringtie/*.gff3|tr '\\n' ','|sed "s/,$//g"`
    else
        gff=`ls {outdir}/results/03.annotation/04.RNA-seq/stringtie/*.gff3`
    fi
    echo $gff

    sed -i 's#^genome=#genome={params.fasta}#g' maker_opts.ctl
    sed -i -e "s|^est_gff=|est_gff=$gff|g" maker_opts.ctl
    sed -i 's#^protein=#protein={params.pro}#g' maker_opts.ctl
    sed -i 's#^rmlib=#rmlib={params.te}#g' maker_opts.ctl
    mpiexec -n {threads} maker -base {wildcards.name} maker_bopts.ctl maker_exe.ctl maker_opts.ctl  >&2 2>{log}
    cd {wildcards.name}.maker.output
    gff3_merge -d ./{wildcards.name}_master_datastore_index.log -o {wildcards.name}.maker.gff 
    fasta_merge -d ./{wildcards.name}_master_datastore_index.log -o {wildcards.name}
    """

SnakeMake GFF3 Rebase From line 20 of rules/03.maker.smk

shell:
    """
    mkdir -p results/03.annotation/06.maker/round2/Training/SNAP
    cd results/03.annotation/06.maker/round2/Training/SNAP
    {module}
    {maker} 
    maker2zff -x {params.aed} ../../../round1/{wildcards.name}.maker.output/{wildcards.name}.maker.gff
    # gather some stats and validate
    fathom genome.ann genome.dna -gene-stats > gene-stats.log 2>&1
    fathom genome.ann genome.dna -validate > validate.log 2>&1
    # collect the training sequences and annotations, plus 1000 surrounding bp for training
    fathom genome.ann genome.dna -categorize 1000 > categorize.log 2>&1
    fathom uni.ann uni.dna -export 1000 -plus > uni-plus.log 2>&1
    mkdir params
    cd params
    forge ../export.ann ../export.dna > ../forge.log 2>&1
    cd ..
    hmm-assembler.pl genome params > {params.hmm}
    """

SnakeMake From line 53 of rules/03.maker.smk

shell:
    """
    {module}
    {maker}
    cp -r results/03.annotation/05.braker_{wildcards.name}/species/* config/species/
    export AUGUSTUS_CONFIG_PATH={params.config}
    mkdir -p results/03.annotation/06.maker/round2 && cd results/03.annotation/06.maker/round2
    cp ../../../../ctl/maker/round2/*.ctl ./
    sed -i 's#^genome=#genome={params.fasta}#g' maker_opts.ctl
    sed -i 's#^maker_gff=#maker_gff={params.gff}#g' maker_opts.ctl
    sed -i "s#^snaphmm=#snaphmm={params.snap}#g" maker_opts.ctl
    sed -i "s#^gmhmm=#gmhmm={params.gm}#g" maker_opts.ctl
    sed -i "s#^augustus_species=#augustus_species=braker_{wildcards.name}#g" maker_opts.ctl
    mpiexec -n {threads} maker -base {wildcards.name} maker_bopts.ctl maker_exe.ctl maker_opts.ctl  >&2 2> maker_round2.log
    gff3_merge -d ./{wildcards.name}.maker.output/{wildcards.name}_master_datastore_index.log -o {wildcards.name}.maker.all.gff
    awk '$2 == "maker"' {wildcards.name}.maker.all.gff > {wildcards.name}.maker.only.gff
    fasta_merge -d ./{wildcards.name}.maker.output/{wildcards.name}_master_datastore_index.log -o {wildcards.name}
    """