Biobakery Data Processing Workflow


A biobakery workflow pipeline for Whole Metagenome Shotgun (wmgx) data processing, implemented with Snakemake rather than AnADAMA2. The tools supported so far are KneadData, MetaPhlAn, and HUMAnN2.


https://bitbucket.org/biobakery/biobakery_workflows/wiki/Home

How to Run

The workflow is set up to run on the example data in the example-data directory. If real data is to be used, some changes are necessary.

To run on HPC cluster:

$ sh run_biobakery_workflow.sh

To run locally (for testing purposes; WARNING: computationally heavy):

$ /shares/hii/sw/snakemake/latest/bin/snakemake 

The Singularity image can be found at:

/shares/hii/images/morenoj/biobakery

Code Snippets

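The first helper script, biobakery-scripts/count_features.py, counts the number of non-zero features per sample in a merged feature table:
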
import sys
import os
import argparse

# This script will count all features for each sample. It can ignore stratification by species
# and also "un"-features if these options are set by the user.

def parse_arguments(args):
    """ 
    Parse the arguments from the user
    """
    parser = argparse.ArgumentParser(
        description= "Count features for each species\n",
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "-i", "--input",
        help="the feature table\n[REQUIRED]",
        metavar="<input.tsv>",
        required=True)
    parser.add_argument(
        "-o", "--output",
        help="file to write counts table\n[REQUIRED]",
        metavar="<output>",
        required=True)
    parser.add_argument(
        "--reduce-sample-name",
        help="remove the extra strings from the sample names",
        action='store_true')
    parser.add_argument(
        "--ignore-un-features",
        help="do not count UNMAPPED, UNGROUPED, or UNINTEGRATED features",
        action='store_true')
    parser.add_argument(
        "--ignore-stratification",
        help="only count the main feature not the stratified instances",
        action='store_true')
    parser.add_argument(
        "--include",
        help="only count features with this string included")
    parser.add_argument(
        "--filter",
        help="do not count features with this string included")

    return parser.parse_args()


def main():

    args = parse_arguments(sys.argv)

    data=[]
    samples=[]
    with open(args.input) as file_handle:
        # remove RPKs from sample name if included
        if args.reduce_sample_name:
            samples=[sample.replace("_Abundance-RPKs","").replace("_genefamilies_Abundance","").replace("_Abundance","").replace("_taxonomic_profile","") for sample in file_handle.readline().rstrip().split("\t")[1:]]
        else:
            samples=file_handle.readline().rstrip().split("\t")[1:]

        for line in file_handle:
            # skip species-stratified rows if requested
            if "|" in line and args.ignore_stratification:
                store=False
            else:
                store=True

            # only skip UNMAPPED/UNGROUPED/UNINTEGRATED rows when requested
            if args.ignore_un_features and ("UNMAPPED" in line or "UNGROUPED" in line or "UNINTEGRATED" in line):
                store=False

            if args.include and args.include not in line:
                store=False

            if args.filter and args.filter in line:
                store=False

            if store:
                data.append(line.rstrip().split("\t")[1:])

    try:
        file_handle=open(args.output,"w")
    except EnvironmentError:
        sys.exit("Error: Unable to open output file: " + args.output)

    # write out the header
    file_handle.write("\t".join(["# samples","total features"])+"\n")

    # count the total features for each sample
    for i, sample in enumerate(samples):
        features=0
        for row in data:
            if float(row[i]) > 0:
                features+=1
        file_handle.write(sample+"\t"+str(features)+"\n")

    file_handle.close()

if __name__ == "__main__":
    main()
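
For example, to count gene families in a merged, renormalized table (the same invocation the final rule below uses):

$ python biobakery-scripts/count_features.py \
    -i output/humann2_analysis/geneFamilies_Norm/geneFam_merged_tables.tsv \
    --reduce-sample-name --ignore-un-features --ignore-stratification \
    -o vis_inputs/humann2/merged/geneFamilies_counts.tsv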
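
The second helper script, biobakery-scripts/get_counts_from_humann2_logs.py, walks a folder of HUMAnN2 .log files and tabulates, for each sample, the total reads, reads aligned at the nucleotide and translated search stages, and the number of species selected in the prescreen:
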
import sys
import os
import argparse

TOTAL_COUNT_TAG="reads; of these:"
NUCLEOTIDE_COUNT_TAG="Unaligned reads after nucleotide alignment:"
TRANSLATED_COUNT_TAG="Unaligned reads after translated alignment:"
SPECIES_COUNT_TAG="Total species selected from prescreen:"

def parse_arguments(args):
    """
    Parse the arguments from the user
    """
    parser = argparse.ArgumentParser(
        description= "Reads the HUMAnN2 logs and prints a table of read counts\n",
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument(
        "-i", "--input",
        help="the folder of log files\n[REQUIRED]",
        metavar="<input_folder>",
        required=True)
    parser.add_argument(
        "-o", "--output",
        help="file to write counts table\n[REQUIRED]",
        metavar="<output>",
        required=True)

    return parser.parse_args()

def main():
    # parse arguments from the user
    args = parse_arguments(sys.argv)

    # search subfolders to find the log files
    log_files=[]
    for path, directories, files in os.walk(args.input):
        for file in filter(lambda x: x.endswith(".log"), files):
            log_files.append(os.path.join(path,file))

    try:
        file_handle=open(args.output,"w")
    except EnvironmentError:
        sys.exit("Error: Unable to open output file: " + args.output)

    # write out the header
    file_handle.write("\t".join(["# samples","total reads","total nucleotide aligned","total translated aligned","total species"])+"\n")

    for file in log_files:
        sample=os.path.basename(file).split(".log")[0]
        data=[sample,"NA","NA","NA","NA"]
        for line in open(file):
            if TOTAL_COUNT_TAG in line:
                data[1]=int(line.split()[7])
            elif NUCLEOTIDE_COUNT_TAG in line:
                try:
                    data[2]=int(data[1]*((100-float(line.split()[-2]))/100.0))
                except (ValueError, TypeError):
                    print("Warning: Unable to compute nucleotide reads aligned from log line: " + line)
            elif TRANSLATED_COUNT_TAG in line:
                try:
                    data[3]=int(data[1]*((100-float(line.split()[-2]))/100.0))
                except (ValueError, TypeError):
                    print("Warning: Unable to compute translated reads aligned from log line: " + line)
            elif SPECIES_COUNT_TAG in line:
                data[4]=line.split()[-1]

        file_handle.write("\t".join([str(i) for i in data])+"\n")

    file_handle.close()
    print("Read table written to file: " + args.output)

if __name__ == "__main__":
    main()
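
For example (matching the invocation in the final rule below):

$ python biobakery-scripts/get_counts_from_humann2_logs.py \
    --input output/humann2_analysis/logs \
    --output vis_inputs/humann2/counts/humann2_read_and_species_count_table.tsv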
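
The KneadData rule trims reads and removes host sequences by screening against a GRCh38 Bowtie2 index, then concatenates the surviving paired reads into a single FASTQ per sample:
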
shell:
    """
    {SINGULARITY} kneaddata \
        --input {input.R1} \
        --input {input.R2} \
        --reference-db {BOWTIE_GRCH38_BUILD} \
        --bowtie2 {BOWTIE_DIR} \
        --trimmomatic {TRIMMOMATIC_DIR} \
        --output {params.directory}

    cat {params.directory}/*_paired_*.fastq >> {output}

    """
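
Next, the KneadData logs are gathered into one folder and summarized into a read-count table with kneaddata_read_count_table:
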
shell:
    """
    find output/kneaddata_output/ -name "*.log" | while read i; do \
        mv $i output/kneaddata_output/logs/; done

    {SINGULARITY} kneaddata_read_count_table \
        --input {KNEADDATA_LOGS} \
        --output {output}
    """
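
Each cleaned FASTQ is then profiled taxonomically with MetaPhlAn, writing the profile and keeping the intermediate SAM alignment:
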
shell:
    """
    {SINGULARITY} python {METAPHLAN_PY} {input} \
        --input_type fastq \
        --bowtie2_exe {BOWTIE2_EXEC} \
        --bowtie2_build {BOWTIE2_BUILD} \
        --samout output/metaphlan_analysis/$(basename {input}).sam \
        --output {output}
    """
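
The per-sample profiles are merged into a single table with merge_metaphlan_tables.py:
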
shell:
    """
    {SINGULARITY} python {METAPHLAN_DIR}/utils/merge_metaphlan_tables.py {input} \
        > {output}
    """
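
HUMAnN2 then performs functional profiling on each cleaned FASTQ; passing the MetaPhlAn profile via --taxonomic-profile means HUMAnN2 does not need to run its own taxonomic prescreen:
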
shell:
    """
    {SINGULARITY} humann2 -i {input.fastq} \
        --bowtie2 {BOWTIE_DIR} \
        --metaphlan {METAPHLAN_DIR} \
        --taxonomic-profile {input.profile} \
        --nucleotide-database {NUCLEO_DB} \
        --protein-database {PROTEIN_DB} \
        --threads 4 \
        -o {params}
    """
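
The final rule post-processes the HUMAnN2 outputs: pathway-abundance and gene-family tables are renormalized to relative abundance (humann2_renorm_table), gene families are regrouped to EC numbers (humann2_regroup_table), the per-sample tables are joined (humann2_join_tables), and the two helper scripts above generate the feature- and read-count tables used as visualization inputs:
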
shell:
    """
    find output/humann2_analysis/ -name "*_pathabundance.tsv" | while read i; do \
        echo $i
        echo $(basename $i)
        {SINGULARITY} humann2_renorm_table -i $i \
            --units relab \
            --output {params}/$(basename $i); \
    done

    {SINGULARITY} humann2_join_tables \
        --input output/humann2_analysis/path_Abundance_Norm \
        --output output/humann2_analysis/path_Abundance_Norm/pathAbun_merged_tables.tsv


    find output/humann2_analysis/ -name "*_genefamilies.tsv" | while read i; do \
        {SINGULARITY} humann2_regroup_table -i $i \
            -c {UTILITY_MAPPING} \
            --output output/humann2_analysis/geneFamilies_Grouped/$(basename $i)_ecs.tsv; \
    done

    find output/humann2_analysis/ -name "*_genefamilies.tsv" | while read i; do \
        {SINGULARITY} humann2_renorm_table -i $i \
            --units relab \
            --output output/humann2_analysis/geneFamilies_Norm/$(basename $i)_genefamilies_norm.tsv; \
    done



    find output/humann2_analysis/ -name "*ecs.tsv" | while read i; do \
        {SINGULARITY} humann2_renorm_table -i $i \
            --units relab \
            --output output/humann2_analysis/geneFamilies_Grouped_Norm/$(basename $i)_ecs_norm.tsv; \
    done

    {SINGULARITY} humann2_join_tables \
        --input output/humann2_analysis/geneFamilies_Grouped_Norm \
        --output output/humann2_analysis/geneFamilies_Grouped_Norm/EC_merged.tsv



    {SINGULARITY} humann2_join_tables \
        --input output/humann2_analysis/geneFamilies_Norm \
        --output output/humann2_analysis/geneFamilies_Norm/geneFam_merged_tables.tsv


    {SINGULARITY} python biobakery-scripts/count_features.py \
        -i output/humann2_analysis/geneFamilies_Norm/geneFam_merged_tables.tsv \
        --reduce-sample-name --ignore-un-features --ignore-stratification \
        -o vis_inputs/humann2/merged/geneFamilies_counts.tsv

    {SINGULARITY} python biobakery-scripts/count_features.py \
        -i output/humann2_analysis/geneFamilies_Grouped_Norm/EC_merged.tsv \
        --reduce-sample-name --ignore-un-features --ignore-stratification \
        -o vis_inputs/humann2/merged/EC_counts.tsv

    {SINGULARITY} python biobakery-scripts/count_features.py \
        -i output/humann2_analysis/path_Abundance_Norm/pathAbun_merged_tables.tsv \
        --reduce-sample-name --ignore-un-features --ignore-stratification \
        -o vis_inputs/humann2/merged/pathabundance_relab.tsv

    {SINGULARITY} humann2_join_tables -i vis_inputs/humann2/merged \
        --file_name _counts.tsv \
        -o vis_inputs/humann2/counts/humann2_feature_counts.tsv

    find output/humann2_analysis/ -name "*_pathcoverage.tsv" | while read i; do \
        cp $i output/humann2_analysis/path_Coverage/; done

    touch vis_inputs/anadama.log

    find output/humann2_analysis/ -name "*.log" | while read i; do \
        cp $i output/humann2_analysis/logs/; done

    {SINGULARITY} python biobakery-scripts/get_counts_from_humann2_logs.py \
        --input output/humann2_analysis/logs \
        --output vis_inputs/humann2/counts/humann2_read_and_species_count_table.tsv

    """

Repository: https://github.com/j2moreno/biobakery
License: GNU General Public License v3.0