Gezelvirus Workflow: Studying a Polinton-like Virus in Phaeocystis globosa


Gezelvirus workflow

The repository contains the bioinformatic workflow used in Roitman et al. (2023), "Infection cycle and phylogeny of a Polinton-like virus with a virophage lifestyle infecting Phaeocystis globosa".

Code Snippets

shell:
    "efetch -db nuccore -id {params.id} -format fasta > {output}"
shell:
    "efetch -db nuccore -id {params.id} -format gb > {output}"
shell:
    "cp {input} {output}"
shell:
    "mv {input} {output}"
shell:
    """
    prokka --force --cpus {threads} --prefix {wildcards.genome} --locustag {params.locustag} --kingdom Viruses --outdir {params.outdir}/{wildcards.genome} {input}
    cp {params.outdir}/{wildcards.genome}/{wildcards.genome}.gbk {output}
    """
shell:
    "seqkit grep -rp {params.search} {input} | seqkit replace -p '$' -r ' {wildcards.segment}' -o {output}"
shell:
    "awk -vl={wildcards.segment} '/^LOCUS/{{p=$2==l}}p' {input} > {output}"
shell:
    "perl workflow/helpers/biotags.pl -i {input} -p CDS -T id -t locus_tag,seq | awk '{{printf\"%s %s\\t%s\\n\",$2,$1,$3}}' | seqkit tab2fx | seqkit translate --trim -o {output}"
shell:
    "awk -vl={wildcards.segment} '/^LOCUS/{{p=$2==l}}p' {input} | sed -E 's/\\x93|\\x94/\"/g' > {output}"
shell:
    "perl workflow/helpers/biotags.pl -i {input} -p CDS -T id -t locus_tag,translation | awk '{{printf\"%s %s\\t%s\\n\",$2,$1,$3}}' | seqkit tab2fx -o {output}"
shell:
    "perl workflow/helpers/biotags.pl -i {input} -p CDS -T description -t locus_tag,translation | awk -F\\\\t '$2{{printf\"%s %s\\t%s\\n\",$2,$1,$3}}' | seqkit tab2fx -o {output}"
shell:
    "perl workflow/helpers/biotags.pl -i {input} -p CDS -T description -t protein_id,translation | awk -F\\\\t '$2{{printf\"%s %s\\t%s\\n\",$2,$1,$3}}' | seqkit tab2fx -o {output}"
shell:
    "perl workflow/helpers/biotags.pl -i {input} -T id,seq | seqkit tab2fx -o {output}"
shell:
    "perl workflow/helpers/biotags.pl -i {input} -p CDS -t locus_tag,translation | awk '$1' | seqkit tab2fx -o {output}"
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(stringr))

stdin_fd <- file("stdin")
all.lines <- readLines(stdin_fd)
close(stdin_fd)
param.start <- 1
data.start  <- which(grepl("^ *No Hit", all.lines)) %>% first %>% `+`(1)

align.start <- which(grepl("^No 1", all.lines)) %>% first

param.end <- data.start - 2
data.end  <- align.start - 1
align.end <- length(all.lines)

if (is.na(align.start)) {
    data <- tibble(
        Query           = character(),
        No              =   integer(),
        Hit.ID          = character(),
        Hit.Description = character(),
        Q.ss_pred       = character(),
        Q.query         = character(),
        Q.consensus     = character(),
        Q.Start         =   integer(),
        Q.End           =   integer(),
        Q.Length        =   integer(),
        T.consensus     = character(),
        T.Start         =   integer(),
        T.End           =   integer(),
        T.Length        =   integer(),
        T.hit           = character(),
        T.ss_dssp       = character(),
        T.ss_pred       = character(),
        Aligned_cols    =   integer(),
        E.value         =   numeric(),
        Identities      =   numeric(),
        Probab          =   numeric(),
        Score           =   numeric(),
        Similarity      =   numeric(),
        Sum_probs       =   numeric(),
        Template_Neff   =   numeric()
    )
} else {
    metadata <- data.frame(key = all.lines[param.start:param.end]) %>%
        mutate(value = substr(key, 14, 10000) %>% trimws, key = substr(key, 1, 14) %>% trimws) %>%
        filter(key != "") %>%
        {setNames(.$value, .$key)} %>%
        as.list
    data <- data.frame(Query = sub(" .*", "", metadata$Query), line = all.lines[align.start:align.end], stringsAsFactors = F) %>%
        filter(line != "") %>%
        extract(line, into = c("name", "value"), regex = "([^ ]+) ?(.+)?", remove = F) %>%
        mutate(No = ifelse(name == "No", value, NA) %>% as.integer) %>%
        mutate(Hit.ID = ifelse(substr(name, 1, 1) == ">", substr(name, 2, nchar(name)), NA)) %>%
        mutate(Hit.Description = ifelse(substr(name, 1, 1) == ">", value, NA)) %>%
        mutate(Match = ifelse(grepl("=", name), line, NA)) %>%
        mutate(name = ifelse(grepl("Q Consensus", lag(line)) & grepl("T Consensus", lead(line)), "M", name)) %>%
        mutate(value = ifelse(name == "M", line, value)) %>%
        fill(No) %>%
        group_by(Query, No) %>%
        summarize(
            Hit.ID       = na.omit(Hit.ID) %>% first,
            Hit.Description = na.omit(Hit.Description) %>% first,
            Match        = na.omit(Match) %>% first,
            Q.ss_pred    = value[name == "Q" & grepl("^ss_pred ", value)]         %>% substr(., 16, nchar(.)) %>% paste(collapse = "") %>% gsub(" +", "", .),
            Q.query      = value[name == "Q" & grepl("^Consensus ", lead(value))] %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            Q.consensus  = value[name == "Q" & grepl("^Consensus ", value)]       %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            T.consensus  = value[name == "T" & grepl("^Consensus ", value)]       %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            T.hit        = value[name == "T" & grepl("^Consensus ", lag(value))]  %>% substr(., 16, nchar(.)) %>% paste(collapse = " "),
            T.ss_dssp    = value[name == "T" & grepl("^ss_dssp ", value)]         %>% substr(., 16, nchar(.)) %>% paste(collapse = " ") %>% gsub(" +", "", .),
            T.ss_pred    = value[name == "T" & grepl("^ss_pred ", value)]         %>% substr(., 16, nchar(.)) %>% paste(collapse = "")  %>% gsub(" ", "", .),
            .groups = "drop"
        ) %>%
        extract(Q.consensus, into = c("Q.Start", "Q.End", "Q.Length"), regex = "^ *(\\d+) .+ (\\d+) +[(](\\d+)[)]$", remove = F, convert = T) %>%
        extract(T.consensus, into = c("T.Start", "T.End", "T.Length"), regex = "^ *(\\d+) .+ (\\d+) +[(](\\d+)[)]$", remove = F, convert = T) %>%
        mutate(
            Q.consensus  = gsub("[0-9() ]+", "", Q.consensus),
            Q.query      = gsub("[0-9() ]+", "", Q.query),
            T.consensus  = gsub("[0-9() ]+", "", T.consensus),
            T.hit        = gsub("[0-9() ]+", "", T.hit)
        ) %>%
        #extract(Hit.Description, into = "Hit.Organism",    regex = "[{]([^}]+)[}]",  remove = F) %>%
        #extract(Hit.Description, into = "Hit.Description", regex = "([^;]+)",        remove = F) %>%
        #extract(Hit.Description, into = "Hit.Keywords",    regex = "[^;]+; ([^;]+)", remove = F) %>%
        mutate(Match = str_split(Match, " +")) %>%
        unnest(cols = Match) %>%
        separate(Match, into = c("key", "value"), "=") %>%
        mutate(value = sub("%", "", value) %>% as.numeric) %>%
        spread(key, value) %>%
        rename(E.value = `E-value`) %>%
        mutate(Aligned_cols = as.integer(Aligned_cols))
}

write.table(data, quote = F, sep = "\t", row.names = F)
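The R script above parses hhsearch text output; the summary lines it handles look like `Probab=99.96  E-value=2.8e-31  Score=205.7  Aligned_cols=132  Identities=21%`, which it splits on whitespace and then on `=` before spreading into columns. A minimal Python sketch of that match-line parsing step (function name illustrative):

```python
def parse_match_line(line):
    """Parse an hhsearch summary line of key=value tokens into a dict
    of floats; a trailing '%' (as on Identities) is dropped.  Mirrors
    the str_split / separate / spread steps of the R script."""
    out = {}
    for token in line.split():
        key, _, value = token.partition("=")
        out[key] = float(value.rstrip("%"))
    return out
```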
shell:
    "csvgrep -t -c Gene -r '^({params.regex})$' {input} | csvgrep -c clade -r '^({params.groups})$' | csvcut -c ID,Genome | csvformat -T -K1 > {output}"
shell:
    "cut -f1 {input.tsv} | xargs seqkit faidx {input.fas} | seqkit seq -o {output}"
shell:
    "mafft --localpair --maxiterate 1000 --thread {threads} {input} > {output}"
shell:
    "trimal -in {input} -out {output} -gt {params.gt}"
shell:
    "iqtree2 -s {input} --prefix {params.prefix} -redo --alrt 1000 -B 1000 --seed {params.seed} -T {threads}"
script:
    "scripts/ggtree-viruses.R"
shell:
    "getorf -minsize {params.minsize} -maxsize {params.maxsize} -filter {input} > {output}"
shell:
    "cdhit -i {input} -o {output} -c {params.c} -d 0"
shell:
    "mafft {input} > {output}"
shell:
    "hmmbuild {output} {input}"
shell:
    "hmmsearch -o /dev/null --tblout {output} {input.hmm} {input.faa}"
shell:
    "hmmsearch -o /dev/null --tblout {output} {input.hmm} {input.faa}"
shell:
    "blastp -query {input.query} -db {input.faa} -outfmt '6 {params.headers}' -out {output}"
shell:
    "blastp -query {input.query} -db {input.faa} -outfmt '6 {params.headers}' -out {output}"
shell:
    "awk -ve={params.evalue} '!/^#/&&$5<e{{print$1}}' {input.tblout} | sort -u | xargs -r seqkit faidx -f {input.fasta} | seqkit replace -p '^([^ ]+)' -r '$1 {wildcards.genome}' -o {output}"
shell:
    "awk -ve={params.evalue} '!/^#/&&$5<e{{print$1}}' {input.tblout} | sort -u | xargs -r seqkit faidx -f {input.fasta} | seqkit replace -p '^([^ ]+)' -r '$1 {wildcards.genome}' -o {output}"
shell:
    "awk -ve={params.evalue} '$11<e' {input.blast} | cut -f2 | sort -u | xargs -r seqkit faidx -f {input.fasta} > {output}"
shell:
    "awk -ve={params.evalue} '$11<e' {input.blast} | cut -f2 | sort -u | xargs -r seqkit faidx -f {input.fasta} > {output}"
shell:
    "hmmsearch -o /dev/null --tblout {output} {input.hmm} {input.fasta}"
shell:
    "seqkit seq -gm{params.m} {input} | seqkit rmdup -o {output}"
shell:
    "seqkit seq -gm{params.m} {input} | seqkit rmdup -o {output}"
shell:
    "hmmalign --trim --outformat A2M {input.hmm} {input.fasta} > {output}"
shell:
    "seqkit replace -sp [a-z] {input.a2m} | seqkit seq -nigm{params.m} | xargs seqkit faidx {input.a2m} | seqkit replace -sp [a-z] | seqkit rmdup -so {output}"
shell:
    "seqkit replace -sp [a-z] {input.a2m} | seqkit seq -nigM{params.M} | xargs seqkit faidx {input.a2m} | seqkit replace -sp [a-z] | seqkit rmdup -so {output}"
shell:
    "raxml-ng --redo --evaluate --msa {input.fasta} --tree {input.iqtree} --model {params.model} --seed {params.seed} --prefix {params.prefix} &> {log}"
shell:
    "epa-ng --redo -s {input.fasta} -t {input.tree} --model {input.model} -q {input.short} -w {params.dir} &> {log}"
shell:
    "gappa examine graft --jplace-path {input} --fully-resolve --out-dir {params.out_dir}"
shell:
    "iqtree2 -s {input} --prefix {params.prefix} -redo --alrt 1000 -B 1000 --seed {params.seed} -T {threads}"
script:
    "scripts/ggtree.R"
script:
    "scripts/ggtree.R"
script:
    "scripts/ggtree.R"
shell:
    "perl workflow/helpers/biotags.pl -i {input} -p CDS -t seq-{params.max_len} | awk '$1' | nl -w1 | seqkit tab2fx | seqkit seq -gm{params.min_len} -o {output}"
shell:
    "meme -minw {params.minw} -maxw {params.maxw} -nmotifs {params.nmotifs} -minsites {params.minsites} -maxsites Inf -dna -oc {params.outdir} {input}"
shell:
    """
    awk -vm='{params.motif_re}' '$1=="MOTIF"{{s=$2!~m}}!s' {input} > {output}
    """
shell:
    "seqret -supper1 -filter -outseq {output} {input}"
shell:
    "fimo --oc {params.out_dir} {input.meme} {input.fasta}"
shell:
    "fimo --oc {params.out_dir} {input.meme} {input.fasta}"
script:
    "scripts/fimo.R"
script:
    "scripts/meme.R"
shell:
    "python workflow/scripts/svg_stack.py --direction=v {input} > {output}"
shell:
    "seqkit seq -o {output} {input}"
shell:
    "ffdb fasta -d {output.data} -i {output.index} {input}"
shell:
    "ffdb fasta -d {output.data} -i {output.index} {input}"
shell:
    "cut -f2,3 {input} | nl -w1 > {output}"
shell:
    "ffindex_apply -q -d {output.data} -i {output.index} {input.data} {input.index} -- hhblits -cpu 1 -v 0 -b 1 -z 1 -d {params.db} -i stdin -oa3m stdout -e {params.e} -n {params.n}"
shell:
    "ffindex_apply -q -d {output.data} -i {output.index} {input.data} {input.index} -- $CONDA_PREFIX/scripts/addss.pl -v 0 /dev/stdin /dev/stdout"
shell:
    "ffindex_apply -q -d {output.data} -i {output.index} {input.data} {input.index} -- hhsearch -v 1 -cpu 1 -b 1 -z 1 -i stdin -d {params.db} -o stdout -p {params.p} -Z {params.BZ} -B {params.BZ} -ssm {params.ssm} -contxt {input.contxt}"
shell:
    "ffindex_apply -q -d {output.data} -i {output.index} {input.data} {input.index} -- hhsearch -v 1 -cpu 1 -b 1 -z 1 -i stdin -d {params.db} -o stdout -p {params.p} -Z {params.BZ} -B {params.BZ} -ssm {params.ssm} -contxt {input.contxt}"
shell:
    "ffindex_apply -q {input.data} {input.index} -- Rscript workflow/helpers/parse_hhsuite.R | sed '2,${{/^Query/d}}' > {output}"
shell:
    "mmseqs createdb {input} {output}"
shell:
    "mmseqs createdb {input} {output}"
shell:
    "mmseqs cluster --min-seq-id {params.min_seq_id} -c {params.coverage} --threads {threads} {input} {params.output_prefix} {params.tmp}"
shell:
    "mmseqs result2msa {input.db} {input.db} {params.input_prefix} {params.output_prefix} --msa-format-mode 1"
shell:
    "mpirun -np {threads} cstranslate_mpi -i {params.input_prefix} -o {params.output_prefix} -A {input.cs219_lib} -D {input.context} -x 0.3 -c 4 -I ca3m -b"
shell:
    "mmseqs createtsv {input.db} {input.db} {params.clu_prefix} {output}"
script:
    "scripts/abc_graph.R"
shell:
    "mcl {input} -o {output} --abc -scheme {params.scheme} -te {threads} -I {params.I}"
script:
    "scripts/mcl_graph.R"
library(dplyr)
library(tidyr)

with(snakemake@input, {
    clu_tsv_file      <<- clu_tsv
    segment_tsv_files <<- segment_tsv
    virus_tsv_files   <<- virus_tsv
})
with(snakemake@params, {
    coverage <<- coverage
    probab   <<- probab
    coverage_q_frag <<- coverage_q_frag
    coverage_t_frag <<- coverage_t_frag
    identities_frag <<- identities_frag
    probab_frag     <<- probab_frag
})
output_file <- unlist(snakemake@output)

clusters <- read.table(clu_tsv_file, sep = "\t", col.names = c("Cluster", "ID"))

segment_tsv <- lapply(segment_tsv_files, read.table, header = T, sep = "\t", quote = "")
virus_tsv   <- lapply(virus_tsv_files, read.table, header = T, sep = "\t", quote = "")
data <- c(segment_tsv, virus_tsv) %>%
    bind_rows %>%
    mutate(Q.Coverage = (Q.End - Q.Start + 1) / Q.Length * 100, T.Coverage = (T.End - T.Start + 1) / T.Length * 100) %>%
    # filter(Probab >= probab, Q.Coverage >= coverage, T.Coverage >= coverage) %>%
    filter(Probab >= probab, Q.Coverage >= coverage & T.Coverage >= coverage | Q.Coverage >= coverage_q_frag & T.Coverage >= coverage_t_frag & Identities >= identities_frag & Probab >= probab_frag) %>%
    left_join(clusters, by = c(Hit.ID = "Cluster")) %>%
    select(Query, ID, Probab)
write.table(data, output_file, sep = "\t", row.names = F, col.names = F, quote = F)
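The filter in the script above keeps a hit if it clears the Probab threshold and either the symmetric coverage cut or a relaxed "fragment" cut (high query coverage, partial target coverage, but higher identity and probability). A Python sketch of that decision, with illustrative default thresholds rather than the workflow's actual parameters:

```python
def passes_filter(q_start, q_end, q_len, t_start, t_end, t_len,
                  probab, identities,
                  coverage=80, probab_min=90,
                  coverage_q_frag=80, coverage_t_frag=20,
                  identities_frag=30, probab_frag=95):
    """Reproduce the two-branch filter of the R script: percent
    coverage is (end - start + 1) / length * 100 on both query and
    target; Probab must always pass, plus either the full-coverage
    branch or the stricter fragment branch."""
    q_cov = (q_end - q_start + 1) / q_len * 100
    t_cov = (t_end - t_start + 1) / t_len * 100
    full = q_cov >= coverage and t_cov >= coverage
    frag = (q_cov >= coverage_q_frag and t_cov >= coverage_t_frag
            and identities >= identities_frag and probab >= probab_frag)
    return probab >= probab_min and (full or frag)
```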
import re
from Bio import SeqIO

input_gbk = str(snakemake.input)
output_gbk = str(snakemake.output)
flank = snakemake.params['flank']
segment = snakemake.params['segment']

coords_re = re.compile(r'(.+)_(\d+)-(\d+)[. ]*$')
def get_coords(description):
    search = coords_re.search(description)
    assert search, "Unexpected sequence name: %s" % description
    scaffold, start, end = search.groups()
    start = int(start)
    end = int(end) + flank
    if start > flank:
        start = start - flank
    else:
        start = 1
    return scaffold, start, end

found = False
with open(input_gbk) as fd:
    records = SeqIO.parse(fd, 'genbank')
    for record in records:
        scaffold, start, end = get_coords(record.description)
        if scaffold == segment['scaffold'] and start <= segment['scaffold_start'] and end >= segment['scaffold_end']:
            found = True
            break
assert found, "Segment %s not found" % segment['segment_id']

segment_start = segment['scaffold_start'] - start
segment_end   = segment['scaffold_end'] - start + 1

sub_record = record[segment_start:segment_end]
if segment['strand'] < 0:
    sub_record = sub_record.reverse_complement()

sub_record.name = segment['name']
sub_record.id = "%s_%d-%d" % (segment['scaffold'], segment['scaffold_start'], segment['scaffold_end'])
sub_record.description = sub_record.id

sub_record.annotations["molecule_type"] = "DNA"
SeqIO.write(sub_record, output_gbk, "genbank")
library(dplyr)

input_file <- unlist(snakemake@input)
output_file <- unlist(snakemake@output)
with(snakemake@params, {
    motif_re <<- motif_re
    q_thresh <<- q_thresh
})
gff <- read.table(input_file, header = T) %>%
    filter(grepl(motif_re, matched_sequence), q.value < q_thresh) %>%
    mutate(description = sprintf("Name=%s;Alias=%s;pvalue=%f;qvalue=%f;sequence=%s;", motif_id, motif_alt_id, p.value, q.value, matched_sequence)) %>%
    mutate(source = "fimo", type = "nucleotide_motif", frame = ".") %>%
    select(sequence_name, source, type, start, stop, score, strand, frame, description)
write.table(gff, file = output_file, sep = "\t", row.names = F, col.names = F, quote = F)
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
from collections import defaultdict

ref_annots = []

gff_file = str(snakemake.input["gff"])
fna_file  = str(snakemake.input["fna"])
blastn_file = str(snakemake.input["blastn"])
pfam_file = str(snakemake.input["pfam"])
hmm_dbs = snakemake.input["hmm_dbs"]
hmmscan_files = snakemake.input["hmmscan"]
markers_files = snakemake.input["markers"]

output_file = str(snakemake.output)

evalue_threshold = snakemake.params["evalue"]
coding_feature = snakemake.params["coding_feature"]

min_repeat_id = snakemake.params["min_repeat_id"]
min_repeat_gap = snakemake.params["min_repeat_gap"]
min_repeat_len = snakemake.params["min_repeat_len"]

aliases = {
    "MCP": "Major capsid protein (MCP)",
    "mCP": "Minor capsid protein (mCP)",
    "uncharacterized protein": "-",
    "DNA polymerase - helicase": "DNA polymerase-helicase"
}

features = defaultdict(list)

with open(blastn_file) as fh:
    for line in fh:
        qseqid, sseqid, pident, length, mismatch, gapopen, qstart, qend, sstart, send, evalue, bitscore = line.rstrip().split()
        qstart = int(qstart)
        qend   = int(qend)
        sstart = int(sstart)
        send   = int(send)
        if float(pident) > min_repeat_id and qseqid == sseqid and int(length) >= min_repeat_len and qstart < qend and sstart > send and sstart - qend >= min_repeat_gap:
            loc1 = FeatureLocation(qstart - 1, qend, strand = 1)
            loc2 = FeatureLocation(send - 1, sstart, strand = -1)
            feature = SeqFeature(CompoundLocation([loc1, loc2]), type = 'repeat_region')
            feature.qualifiers['rpt_type'] = 'inverted'
            feature.qualifiers['inference'] = 'ab initio prediction:blastn'
            feature.qualifiers['note'] = 'pident: %s%%' % pident
            features[qseqid].append(feature)

descs = {}
for fname in hmm_dbs:
    with open(fname) as fh:
        NAME = ACC = None
        for line in fh:
            if line.startswith('ACC'):
                key, ACC = line.rstrip().split()
            if line.startswith('NAME'):
                key, NAME = line.rstrip().split()
            if line.startswith('DESC'):
                key, DESC = line.rstrip().split(maxsplit = 1)
                if ACC: descs[ACC] = DESC
                if NAME: descs[NAME] = DESC
                NAME = ACC = None

hits = defaultdict(list)

with open(pfam_file) as fh:
    for line in fh:
        if not line.startswith('#') and line != '\n':
            seq_id, aln_start, aln_end, env_start, env_end, hmm_acc, hmm_name, type, hmm_start, hmm_end, hmm_len, bit_score, E_value, significance, clan = line.rstrip().split()
            hit = "%s: %s (%s)" % (hmm_acc, descs[hmm_acc], E_value)
            hits[seq_id].append(hit)

for fname in markers_files:
    with open(fname) as fh:
        for line in fh:
            if not line.startswith('#'):
                t_name, t_acc, q_name, q_acc, E_value, score, bias, best_E_value, best_score, best_bias, exp, reg, clu, ov, env, dom, rep, inc, desc = line.rstrip().split(maxsplit = 18)
                if float(E_value) <= evalue_threshold:
                    hit = "Marker %s: %s (%s)" % (q_name, descs[q_name], E_value)
                    hits[t_name].append(hit)

for fname in hmmscan_files:
    with open(fname) as fh:
        for line in fh:
            if not line.startswith('#'):
                t_name, t_acc, q_name, q_acc, E_value, score, bias, best_E_value, best_score, best_bias, exp, reg, clu, ov, env, dom, rep, inc, desc = line.rstrip().split(maxsplit = 18)
                if desc in aliases:
                    desc = aliases[desc]
                definition = ' ' + desc if t_name != desc and desc != '-' else ''
                if float(E_value) <= evalue_threshold:
                    hit = "%s:%s (%s)" % (t_name, definition, E_value)
                    hits[q_name].append(hit)

with open(gff_file) as fh:
    for line in fh:
        if not line.startswith('#') and line != '\n':
            seqname, source, feature_type, start, end, score, strand, frame, attributes = line.rstrip().split('\t')
            if feature_type == coding_feature:
                atts = {}
                for att in attributes.split(';'):
                    key, value = att.split('=')
                    atts[key] = value
                if 'Parent' in atts:
                    parent = atts['Parent']
                else:
                    parent = atts['ID']
                loc = FeatureLocation(int(start) - 1, int(end))
                feature = SeqFeature(loc, type = 'CDS', strand = int(strand + '1'))
                feature.qualifiers['locus_tag'] = parent
                feature.qualifiers['inference'] = "ab initio prediction:" + source.replace("_v", ":")
                if parent in hits:
                    feature.qualifiers['note'] = hits[parent]
                features[seqname].append(feature)

with open(output_file, 'w') as out_fh:
    with open(fna_file) as in_fh:
        rec_num = 1
        for record in SeqIO.parse(in_fh, 'fasta'):
            record.annotations['molecule_type'] = 'DNA'
            record.name = "locus_%d" % rec_num
            for feature in features[record.id]:
                if feature.type == 'CDS':
                    transl = feature.translate(record, cds = False)
                    feature.qualifiers['translation'] = transl.seq.rstrip('*')
                record.features.append(feature)
            SeqIO.write(record, out_fh, 'genbank')
            rec_num += 1
library(dplyr)
library(tidyr)
library(ape)
library(ggtree)
library(treeio)
library(phangorn)
library(stringr)
library(ggplot2)
library(phytools)

if (interactive()) {
    setClass("snake", slots = list(input = "list", output = "list"))
    snakemake <- new("snake", input  = list(
            tree = "analysis/phylogeny/MCP_NCLDV_epa/epa_result.newick",
            fasta = "analysis/phylogeny/MCP_NCLDV.fasta",
            outgroups    = "metadata/queries/MCP_NCLDV_outgroups.faa",
            synonyms = "metadata/organisms.txt",
            hmm = Sys.glob("hmm_algae/*.hmm")
    ), output = list(
        image = "test.svg",
        jtree = "output/MCP_NCLDV.jtree"
    ))
}

with(snakemake@input, {
    tree_file     <<- tree
    fasta_file    <<- fasta
    synonyms_file <<- synonyms
    outgroup_file <<- outgroups
})
with(snakemake@output, {
    out_image_file <<- image
    out_jtree_file <<- jtree
})
with(snakemake@params, {
    outgroup_rooting <<- outgroup_rooting
})

read.fasta.headers <- function(fnames) {
    file.info(fnames) %>%
        filter(size > 0) %>%
        rownames %>%
        lapply(treeio::read.fasta) %>%
        lapply(names) %>%
        unlist %>%
        data.frame(title = .)
}

synonyms <- read.table(synonyms_file, header = T, sep = "\t", fill = T, na.strings = "") %>%
    mutate(Collapse = ifelse(is.na(Collapse), Name, Collapse))

headers <- read.fasta.headers(fasta_file) %>%
    extract(title, into = c("label", "ID"), regex = "^([^ ]+) ([^ ]+)", remove = F) %>%
    left_join(synonyms, by = "ID")

no_name <- filter(headers, is.na(Name)) %>%
    pull(label) %>%
    paste(collapse = ", ")
if (no_name != "") {
    print(paste("No aliases found for: ", no_name))
    quit(status = 1)
}

tree <- read.tree(tree_file)
tree <- phangorn::midpoint(tree, node.labels = "support")
if (outgroup_rooting) {
    outgroup_df <- read.fasta.headers(outgroup_file)
    outgroups <- with(outgroup_df, sub(" .*", "", title))
    tree <- ape::root(tree, node = MRCA(tree, outgroups), edgelabel = T, resolve.root = T)
}
tree <- as_tibble(tree) %>%
    mutate(support = ifelse(node %in% parent & label != "", label, NA)) %>%
    separate(support, into = c("SH_aLRT", "UFboot"), sep = "/", convert = T) %>%
    left_join(headers, by = "label") %>%
    mutate(label.show = Name) %>%
    mutate(isInternal = node %in% parent) %>%
    `class<-`(c("tbl_tree", "tbl_df", "tbl", "data.frame"))
tree_data <- as.treedata(tree)
write.jtree(tree_data, file = out_jtree_file)

ntaxa <- filter(tree, ! node %in% parent) %>% nrow

colors <- list(
    Haptophyta = "orange",
    Chlorophyta = "green",
    Streptophyta = "darkgreen",
    MAG = "purple",
    Stramenopiles = "brown",
    Cryptophyta = "red",
    Amoebozoa = "gold4",
    Euglenozoa = "yellow",
    Choanoflagellata = "darkslateblue",
    Glaucophyta = "cyan",
    Animals = "blue",
    Dinoflagellata = "gray50",
    Rhizaria = "gray30"
)

scaleClades <- function(p, df) {
    with(df, Reduce(function(.p, .node) {
        offs <- offspring(.p$data, .node)
        scale <- 0.5 / (nrow(offs) - 1)
        scaleClade(.p, .node, scale)
    }, node, p))
}
collapseClades <- function(p, df) {
    with(df, Reduce(function(.p, .node) {
        fill <- unlist(colors[Host[node == .node]])
        .p$data[.p$data$node == .node, "label.show"] <- label.show[node == .node]
        collapse(.p, .node, "mixed", fill = fill)
    }, node, p))
}
#labelClades <- function(p) {
#    with(df, Reduce(function(.p, .node) {
#        .p + geom_cladelab(node = .node, label = label[node == .node], align = T, offset = .2, textcolor = 'blue')
#    }, node, p))
#}

multi_species <- allDescendants(tree_data@phylo) %>%
    lapply(function(x) filter(tree, node %in% x)) %>%
    bind_rows(.id = "ancestor") %>%
    group_by(ancestor) %>%
    filter(n_distinct(Collapse, na.rm = T) == 1, sum(!isInternal) > 1) %>% # , !any(Group == "Haptophyta")) %>%
    ungroup %>%
    mutate(ancestor = as.numeric(ancestor)) %>%
    filter(! ancestor %in% node) %>%
    filter(!is.na(Collapse)) %>%
    group_by(ancestor, Collapse) %>%
    summarize(num_tips = sum(!isInternal), Host = first(na.omit(Host))) %>%
    mutate(label.show = sprintf("%s (%d)", Collapse, num_tips)) %>%
    rename(node = ancestor)
p <- ggtree(tree_data) +
    geom_nodepoint(aes(x = branch, subset = !is.na(UFboot) & UFboot >= 90, size = UFboot)) +
    geom_tiplab(aes(label = label.show), size = 4, align = T, linesize = 0) +
    geom_text2(aes(subset = node %in% multi_species$node, x = max(x, na.rm = T), label = label.show), nudge_x = 0.01, size = 4, hjust = 0) +
    geom_tippoint(aes(color = Host), size = 3) +
    geom_treescale(width = 0.5) +
    scale_size_continuous(limits = c(90, 100), range = c(1, 3)) +
    scale_shape_manual(values = seq(0,15)) +
    scale_color_manual(values = colors)

p <- scaleClades(p, multi_species)
p <- collapseClades(p, multi_species)
# p <- facet_plot(p, mapping = aes(x = as.numeric(as.factor(query.name)), shape = DESC), data = genes, geom = geom_point, panel = 'Genes')

ggsave(out_image_file, p, height = ntaxa * 0.1, width = 7, limitsize = F)
library(dplyr)
library(tidyr)
library(ape)
library(ggtree)
library(treeio)
library(phangorn)
library(stringr)
library(ggplot2)
library(phytools)

if (interactive()) {
    setClass("snake", slots = list(input = "list", output = "list"))
    snakemake <- new("snake", input  = list(
            tree = "analysis/phylogeny_viruses/MCP-small.treefile",
            tsv  = "analysis/phylogeny_viruses/MCP-small.tsv",
            virus_metadata = "metadata/viruses.tsv",
            segment_metadata = "metadata/viral_segments.tsv",
            colors = "metadata/subclade_colors.txt"
    ), output = list(
        image = "test.svg",
        jtree = "test.jtree"
    ))
}

with(snakemake@input, {
    tree_file <<- tree
    tsv_file  <<- tsv
    virus_metadata_file <<- virus_metadata
    segment_metadata_file <<- segment_metadata
    colors_file <<- colors
})
with(snakemake@output, {
    out_image_file <<- image
    out_jtree_file <<- jtree
})

colors <- read.table(colors_file, col.names = c("subclade", "color"), comment.char = "") %>%
    with(setNames(color, subclade))
viruses  <- read.table(virus_metadata_file, header = T, sep = "\t", na.strings = "")
segments <- read.table(segment_metadata_file, header = T, sep = "\t", na.strings = "")
metadata <- bind_rows(viruses, segments) %>%
    mutate(subclade = ifelse(is.na(subclade), clade, subclade))

orgs <- read.table(tsv_file, col.names = c("label", "Organism"), sep = "\t")

tree <- read.tree(tree_file) %>%
    phangorn::midpoint(node.labels = "support") %>%
    as_tibble %>%
    mutate(support = ifelse(node %in% parent & label != "", label, NA)) %>%
    separate(support, into = c("SH_aLRT", "UFboot"), sep = "/", convert = T) %>%
    left_join(orgs, by = "label") %>%
    left_join(metadata, by = c(Organism = "short")) %>%
    mutate(isInternal = node %in% parent) %>%
    `class<-`(c("tbl_tree", "tbl_df", "tbl", "data.frame"))
ntaxa <- filter(tree, ! node %in% parent) %>% nrow
tree_data <- as.treedata(tree)
write.jtree(tree_data, file = out_jtree_file)

p <- ggtree(tree_data) +
    geom_nodepoint(aes(x = branch, subset = !is.na(UFboot) & UFboot >= 90, size = UFboot)) +
    geom_tiplab(aes(label = Organism, color = subclade), size = 2, align = F, linesize = 0) +
    scale_color_manual(values = colors) +
    geom_treescale(width = 0.5) +
    scale_size_continuous(limits = c(90, 100), range = c(1, 2))

ggsave(out_image_file, p, height = ntaxa * 0.1, width = 4, limitsize = F)
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
from pandas import read_csv


input_files = snakemake.input["gbk"]
data_file   = str(snakemake.input["data"])
output_file = str(snakemake.output)

data = read_csv(data_file, sep = "\t")

with open(output_file, 'w') as fd_out:
    for input_file in input_files:
        with open(input_file) as fd_in:
            for record in SeqIO.parse(fd_in, 'genbank'):
                for feature in record.features:
                    if feature.type == 'CDS' and 'locus_tag' in feature.qualifiers:
                        ID = feature.qualifiers['locus_tag'][0]
                        if ID:
                            row = data[data.ID == ID]
                            if 'note' not in feature.qualifiers:
                                feature.qualifiers['note'] = []
                            if not row.empty:
                                if row.Gene_Pfam.notnull().item():
                                    feature.qualifiers['note'].append('Gene by hhsearch Pfam match: %s' % row.Gene_Pfam.item())
                                    feature.qualifiers['product'] = row.Gene_Pfam.item()
                                if row.Gene_Cluster.notnull().item():
                                    feature.qualifiers['note'].append('Gene by cluster membership: %s' % row.Gene_Cluster.item())
                                    feature.qualifiers['product'] = row.Gene_Cluster.item()
                                if row['Hit.ID'].notnull().item() and row.Probab.notnull().item():
                                    feature.qualifiers['note'].append('hhsearch best hit against Pfam: %s %s (%s)' % (row['Hit.ID'].item(), row['Hit.Description'].item(), row.Probab.item()))
                SeqIO.write(record, fd_out, 'genbank')
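The pandas lookup pattern used above (select the row for a locus tag, then guard with `notnull().item()` before calling `.item()`) can be sketched in isolation. The table contents and the `lookup_gene` helper below are illustrative, not taken from the workflow's own data files:

```python
from pandas import DataFrame

# Hypothetical annotation table with the same ID/Gene_Pfam columns as above
data = DataFrame({
    "ID": ["PGV_001", "PGV_002"],
    "Gene_Pfam": ["MCP", None],
})

def lookup_gene(table, locus_tag):
    # Select the (at most one) row matching this locus tag
    row = table[table.ID == locus_tag]
    if row.empty:
        return None
    # .item() requires exactly one element; check for missing values first
    if row.Gene_Pfam.notnull().item():
        return row.Gene_Pfam.item()
    return None
```

`.item()` raises a ValueError when the selection holds zero or several elements, which is why the `row.empty` guard comes first.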
library(dplyr)
library(tidyr)
library(scatterpie)
library(tibble)
library(treeio)
library(igraph)
library(ggraph)
library(ape)

if (interactive()) {
    mcl_file <- "analysis/hh_mcl/mcl.txt"
    virus_metadata_file   <- "metadata/viruses.tsv"
    segment_metadata_file <- "metadata/viral_segments.tsv"
    segment_ffindex_files <- Sys.glob("analysis/PLV_segments/*.ffindex")
    virus_ffindex_files   <- Sys.glob("analysis/PLVs/*.ffindex")
    segment_pfam_files    <- Sys.glob("analysis/PLV_segments_hh/*-hhsearch-pfam.tsv")
    virus_pfam_files      <- Sys.glob("analysis/PLVs_hh/*-hhsearch-pfam.tsv")
    genes_cluster_file <- "metadata/genes_cluster.tsv"
    genes_pfam_file <- "metadata/genes_pfam.tsv"
    probab_threshold <- 80
    core_genes_file <- "pie.pdf"
    jtree_file      <- "output/MCP_PLV.jtree"
    reduced_tree_file <- "output/MCP_PLV_reduced.svg"
    chosen_clades <- c("Gezel", "Dwarf", "Mesomimi")
    clade_levels <- c("Mesomimi", "Dwarf", "Virophage", "TVS", "Gezel")
    ref_genomes <- c("Sputnik", "Mavirus_Spezl", "TVV_S1")
    colors_file <- "metadata/subclade_colors.txt"
    plv_order_file <- "metadata/plv_order.txt"
    family_colors_file <- "metadata/family_colors.txt"
} else {
    with(snakemake@input, {
        mcl_file              <<- mcl
        virus_metadata_file   <<- virus_metadata
        segment_metadata_file <<- segment_metadata
        segment_pfam_files    <<- segment_pfam
        virus_pfam_files      <<- virus_pfam
        segment_ffindex_files <<- segment_ffindex
        virus_ffindex_files   <<- virus_ffindex
        genes_cluster_file    <<- genes_cluster
        genes_pfam_file       <<- genes_pfam
        jtree_file            <<- jtree
        colors_file           <<- colors
        plv_order_file        <<- plv_order
        family_colors_file    <<- family_colors
    })
    with(snakemake@params, {
        probab_threshold  <<- probab
        chosen_clades     <<- chosen_clades
        clade_levels      <<- clade_levels
        ref_genomes       <<- ref_genomes
    })
    with(snakemake@output, {
        data_file       <<- data
        core_genes_file <<- core_genes
        bipart_file     <<- bipartite
        reduced_tree_file <<- reduced_tree
    })
}
segments <- basename(segment_ffindex_files) %>% sub(".ffindex", "", .)
viruses  <- basename(virus_ffindex_files)   %>% sub(".ffindex", "", .)

genes_pfam <- read.table(genes_pfam_file, header = T, sep = "\t", fill = T, na.strings = "") %>%
    mutate(Comb = 1:n()) %>%
    separate_rows(Pfam, sep = ",") %>%
    group_by(Comb) %>%
    mutate(Comb.Num = n())
genes_cluster <- read.table(genes_cluster_file, header = T, sep = "\t")

metadata <- bind_rows(
    read.table(virus_metadata_file, sep = "\t", header = T),
    read.table(segment_metadata_file, sep = "\t", header = T)
) %>% `rownames<-`(.$short)
clusters <- readLines(mcl_file) %>%
    data.frame(ID = ., Cluster = 1:length(.)) %>%
    separate_rows(ID, sep = "\t") %>%
    left_join(genes_cluster, by = "ID") %>%
    group_by(Cluster) %>%
    mutate(Gene_Cluster = first(na.omit(Gene_Cluster)))

segment_pfam <- lapply(segment_pfam_files, read.table, header = T, sep = "\t", quote = "")
virus_pfam   <- lapply(virus_pfam_files, read.table, header = T, sep = "\t", quote = "")
pfam_all <- c(segment_pfam, virus_pfam) %>%
    bind_rows %>%
    select(-c(Q.query, T.ss_pred, T.hit, T.consensus, Q.consensus, T.ss_dssp, Q.ss_pred)) %>%
    extract(Hit.Description, into = "Hit.Name", regex = "; (.+?) ;", remove = F)
pfam <- filter(pfam_all, Probab >= probab_threshold)

to_gff <- function(.data) {
    mutate(.data, Source = "hhsearch", Feature = "domain", Strand = ".", Fname = ".", Attributes = paste(Hit.ID, Hit.Description)) %>%
        select(ID, Source, Feature, Q.Start, Q.End, Probab, Strand, Fname, Attributes)
}
most_freq <- function(.data) {
    na.omit(.data) %>%
        table(useNA = "always") %>%
        which.max %>%
        names
}
first_nona <- function(.data) {
    first(na.omit(.data))
}

segment_genes <- lapply(segment_ffindex_files, read.table, sep = "\t") %>%
    setNames(segments)
virus_genes   <- lapply(virus_ffindex_files, read.table, sep = "\t") %>%
    setNames(viruses)
data <- c(segment_genes, virus_genes) %>%
    bind_rows(.id = "Genome") %>%
    select(Genome, ID = 2) %>%
    left_join(pfam, by = c(ID = "Query")) %>%
    left_join(metadata, by = c(Genome = "short")) %>%
    left_join(genes_pfam, by = c(Hit.Name = "Pfam")) %>%
    group_by(Genome, ID, Comb) %>%
    mutate(Gene_Pfam = ifelse(n_distinct(Hit.ID) >= Comb.Num, Gene_Pfam, NA), Gene_Pfam = ifelse(is.na(No_threshold) | No <= No_threshold, Gene_Pfam, NA)) %>%
    left_join(clusters, by = "ID") %>%
    mutate(Cluster = as.character(ifelse(is.na(Cluster), ID, Cluster))) %>%
    ungroup %>%
    arrange(No, -Comb.Num) %>%
    group_by(Genome, ID, Cluster, clade, subclade) %>%
    summarize(Gene_Pfam = first_nona(Gene_Pfam), Gene_Cluster = first_nona(Gene_Cluster), Hit.ID = first(Hit.ID), Hit.Description = first(Hit.Description), Probab = first(Probab)) %>%
    group_by(Cluster) %>%
    mutate(Gene_Pfam = most_freq(Gene_Pfam), Gene_Cluster = most_freq(Gene_Cluster)) %>%
    mutate(Gene = ifelse(is.na(Gene_Cluster), Gene_Pfam, Gene_Cluster), Gene = ifelse(is.na(Gene), paste0("Cluster_", Cluster), Gene), Gene = ifelse(is.na(Gene), paste0("ID_", ID), Gene)) %>%
    ungroup
write.table(data, data_file, sep = "\t", quote = F, row.names = F)

virus_segments <- read.table(segment_metadata_file, sep = "\t", header = T) %>%
    mutate(scaffold = sub("Isogal_", "", scaffold)) # NB: workaround for Isogal scaffolds. TODO: do a proper fix
tree <- read.jtree(jtree_file)
tip.data <- filter(tree@data, !isInternal) %>%
    mutate(label = sub(" .*", "", title)) %>%
    extract(title, into = c("scaffold", "gene_num", "genome", "start", "end"), regex = "^(.+?)_(\\d+) (.+?) \\[(\\d+) - (\\d+)\\]", remove = F, convert = T) %>%
    left_join(virus_segments, by = c("scaffold")) %>%
    filter(is.na(scaffold_start) | is.na(start) | start >= scaffold_start & end <= scaffold_end) %>%
    mutate(Name = ifelse(is.na(short), Name, short)) %>%
    {setNames(.$Name, .$label)}
tree@phylo$tip.label <- tip.data[tree@phylo$tip.label]
tips.to.keep <- filter(data, clade == "Gezel") %>%
    pull(Genome) %>%
    `[`(. %in% tree@phylo$tip.label)
tips.to.drop <- tree@phylo$tip.label[! tree@phylo$tip.label %in% tips.to.keep]
my.tree <- tree@phylo %>%
    keep.tip(tips.to.keep) %>%
    drop.tip(tips.to.drop) %>%
    ladderize %>%
    chronos(lambda = 0.1)
svg(reduced_tree_file)
plot(my.tree)
dev.off()

#order.tips <- data.frame(tip.num = my.tree$edge[,2]) %>%
#    filter(tip.num <= length(my.tree$tip.label)) %>%
#    mutate(Genome = my.tree$tip.label[tip.num], num = n():1)

order.tips <- read.table(plv_order_file, col.names = "Genome") %>%
    mutate(num = n():1)

clades.data <- filter(data, clade %in% chosen_clades | Genome %in% ref_genomes) %>%
    mutate(Count = 1) %>%
    complete(Gene, nesting(Genome, clade, subclade), fill = list(Count = 0)) %>%
    group_by(Gene) %>%
    mutate(N_Subclades = n_distinct(subclade[clade == "Gezel" & Count > 0]), N_Genomes = n_distinct(Genome[clade == "Gezel" & Count > 0])) %>%
    filter(N_Subclades > 2) %>%
    mutate(clade = factor(clade, levels = clade_levels)) %>%
    left_join(order.tips, by = "Genome") %>%
    replace_na(list(num = 0)) %>%
    arrange(-N_Genomes, num, Gene, clade, desc(subclade), Genome) %>%
    ungroup %>%
    mutate(Gene = factor(Gene, levels = unique(Gene)), Genome = factor(Genome, levels = unique(Genome))) %>%
    filter(Count > 0)
family_colors <- read.table(family_colors_file, sep = "\t", col.names = c("Gene", "color"), comment.char = "") %>%
    right_join(clades.data, by = "Gene") %>%
    group_by(Gene, Cluster, color) %>%
    filter(!is.na(color)) %>% 
    summarize(n = n(), .groups = "drop") %>%
    group_by(Gene) %>%
    arrange(n) %>%
    mutate(a = seq(0.6, 0, len = n())) %>%
    mutate(r = col2rgb(color)[1], g = col2rgb(color)[2], b = col2rgb(color)[3]) %>%
    mutate(r = (255 - r) * a + r, g = (255 - g) * a + g, b = (255 - b) * a + b) %>%
    mutate(color = rgb(r / 255, g / 255, b / 255)) %>%
    with(setNames(color, Cluster))
ggplot_scatterpie <- function(.data, x.col, y.col, z.col, val.col, group.col) {
    x_uniq <- levels(.data[[x.col]])
    y_uniq <- levels(.data[[y.col]])
    .data <- ungroup(.data) %>%
        mutate(x = as.numeric(.[[x.col]]), y = as.numeric(.[[y.col]])) %>%
        rename(value = !!val.col, z = !!z.col)
    ggplot() +
        geom_scatterpie(aes(x = x, y = y), data = .data, long_format = T, cols = "z", color = NA) +
        scale_x_continuous(breaks = 1:length(x_uniq), labels = x_uniq) +
        scale_y_continuous(breaks = 1:length(y_uniq), labels = y_uniq) +
        xlab(x.col) + ylab(y.col) #+
        #facet_grid(rows = group.col, scales = "free_y", space = "free_y")
}

p <- ggplot_scatterpie(clades.data, "Gene", "Genome", "Cluster", "Count", "subclade") +
    scale_fill_manual(values = family_colors, na.value = "black") +
    coord_equal() +
    theme_void() +
    theme(axis.title.x = element_blank(), axis.title.y = element_blank(), legend.position = "none", axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1), axis.text.y = element_text())
ggsave(core_genes_file, p, height = 12)

vertex_metadata <- list(
        Cluster = distinct(data, Cluster, Gene) %>%
            mutate(label = ifelse(grepl("Cluster", Gene), NA, as.character(Gene)), subclade = "Cluster") %>%
            column_to_rownames("Cluster"),
        Genome = mutate(metadata, label = short)
    ) %>% bind_rows(.id = "Type")

g <- filter(data, clade == "Gezel") %>%
    distinct(Genome, Cluster) %>%
    group_by(Cluster) %>%
    filter(n() > 2) %>%
    mutate(present = T) %>%
    spread(Genome, present, F) %>%
    column_to_rownames("Cluster") %>%
    as.matrix %>%
    graph.incidence %>%
    set_vertex_attr("subclade", value = vertex_metadata[match(V(.)$name, rownames(vertex_metadata)),"subclade"]) %>%
    set_vertex_attr("label",    value = vertex_metadata[match(V(.)$name, rownames(vertex_metadata)),"label"]) %>%
    set_vertex_attr("gene",     value = vertex_metadata[match(V(.)$name, rownames(vertex_metadata)),"Gene"]) %>%
    set_vertex_attr("node_type", value = vertex_metadata[match(V(.)$name, rownames(vertex_metadata)),"Type"])

colors <- read.table(colors_file, comment.char = "") %>%
    with(setNames(V2,V1)) %>%
    c(Cluster = "black", "darkgray")

set.seed(1234)
p <- ggraph(g, "fr") +
    geom_edge_link(alpha = .1) +
    geom_node_point(aes(shape = node_type, colour = subclade, size = type)) +
    geom_node_text(aes(filter = !type, label = label), size = 1.5, color = "red",   repel = T, nudge_y = -0.01, hjust = "right") +
    geom_node_text(aes(filter = type,  label = label), size = 2,   color = "black", repel = T, nudge_y = -0.01, hjust = "right") +
    scale_size_manual(values = c(1,2)) +
    scale_color_manual(values = colors) +
    theme_graph(base_family = "sans")
ggsave(bipart_file, p, width = 10, height = 8)
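The incidence-matrix step above (clusters kept when present in more than two Gezel genomes, then turned into a bipartite cluster–genome graph) can be sketched without igraph. The genome and cluster names below are made up for illustration:

```python
from collections import defaultdict

def bipartite_edges(presence, min_genomes=3):
    """presence: (genome, cluster) pairs; returns cluster->genome edges
    for clusters seen in at least min_genomes genomes (n() > 2 above)."""
    genomes_by_cluster = defaultdict(set)
    for genome, cluster in set(presence):       # distinct(Genome, Cluster)
        genomes_by_cluster[cluster].add(genome)
    return sorted((cluster, genome)
                  for cluster, genomes in genomes_by_cluster.items()
                  if len(genomes) >= min_genomes
                  for genome in genomes)
```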
suppressPackageStartupMessages(library(dplyr))
library(XML)
library(ggplot2)
library(ggseqlogo)
library(stringr)
library(tidyr)
suppressPackageStartupMessages(library(cowplot))

input_file <- unlist(snakemake@input)
output_file <- unlist(snakemake@output)
with(snakemake@params, {
    motif_res <<- data.frame(motif_name = motif_res, regex = motif_res)
    meme_name <<- meme_name
    max_len   <<- max_len
})
data <- xmlToList(xmlParse(input_file))
sequences <- data$training_set %>%
    `[`(names(.) == "sequence") %>%
    bind_rows %>%
    mutate(length = as.numeric(length))

motif.data <- data$motifs %>%
    `[`(names(.) == "motif") %>%
    setNames(sapply(., function(x) x$.attrs["id"]))

motifs <- motif.data %>%
    lapply(function(x) as.data.frame(t(x$.attrs))) %>%
    bind_rows %>%
    mutate_at(vars(-id, -name, -alt), as.numeric) %>%
    mutate(sites_r = sites / nrow(sequences))
motifs.filtered <- left_join(motifs, motif_res, by = character()) %>%
    filter(str_detect(name, regex)) %>%
    distinct(motif_name, .keep_all = T)

site.attrs <- function(site) {
    attrs <- as.data.frame(t(site$.attrs))
    attrs$left_flank  <- site$left_flank
    attrs$site        <- paste(site$site[names(site$site) == "letter_ref"], collapse = "")
    attrs$right_flank <- site$right_flank
    return(attrs)
}
fetch.sites <- function(sites) {
    sites <- sites$contributing_sites
    sites[names(sites) == "contributing_site"] %>% lapply(site.attrs) %>% bind_rows
}

sites <- lapply(motif.data, fetch.sites) %>%
    bind_rows(.id = "id") %>%
    filter(id %in% motifs.filtered$id) %>%
    left_join(sequences, by = c(sequence_id = "id")) %>%
    mutate(position = as.numeric(position) - length, pvalue = as.numeric(pvalue))

plot.motif <- function(motif, .y) {
    motif.sites <- sites %>% filter(id == .y$id)
    label <- with(motif, paste(
        gsub("_", " ", meme_name),
        name,
        paste("Sites =", sites, paste0("(", round(sites_r * 100, 1), "%)")),
        # paste("IC =", round(ic, 3)),
        paste("E-value =", e_value),
        sep = "\n"
    ))
    g1 <- ggplot() + annotate("text", label = label, x = 1, y = 1) +
        theme_bw() + theme(axis.text = element_blank(), axis.ticks = element_blank(), axis.title = element_blank(), panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank())
    g2 <- motif.sites %>% pull(site) %>% ggseqlogo(seq_type = "dna") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
    g3 <- ggplot(motif.sites, aes(x = position)) + geom_histogram(binwidth=10) + xlim(-150, 0) +
        theme_bw() +
        theme(axis.text.y = element_blank(), axis.ticks = element_blank(), axis.title = element_blank())
    g <- plot_grid(g1, g2, g3, nrow = 1, align = 'h')
    return(g)
}

if (nrow(motifs.filtered) == 0) {
    g <- ggplot() + theme_void()
    ggsave(output_file, g, width = 1, height = 1)
} else {
    g.list <- motifs.filtered %>%
        group_by(id) %>%
        group_map(plot.motif)
    ggsave(output_file, plot_grid(plotlist = g.list, ncol = 1), width = 7, height = length(g.list) * 1.4)
}
library(dplyr)
library(tidyr)
library(igraph)
library(ggplot2)
# library(gggenes)

input <- snakemake@input
output <- snakemake@output
params <- snakemake@params

hmmsearch_files <- unlist(input)

# figure_file <- unlist(output["figure"])
bed_file <- unlist(output["bed"])

E.value.threshold <- unlist(params["e_value_threshold"]) # 0.001
distance.threshold <- unlist(params["distance_threshold"]) # 200000
genes.threshold <- unlist(params["genes_threshold"]) # 2

read.hmmer.tblout <- function(fname) {
	col.names <- c("target.name", "target.accession", "query.name", "query.accession", "full.sequence.E.value", "full.sequence.score", "full.sequence.bias", "best.1.domain.E.value", "best.1.domain.score", "best.1.domain.bias", "domain.number.estimation.exp", "domain.number.estimation.reg", "domain.number.estimation.clu", "domain.number.estimation.ov", "domain.number.estimation.env", "domain.number.estimation.dom", "domain.number.estimation.rep", "domain.number.estimation.inc", "description.of.target")
	numeric.cols <- which(col.names == "full.sequence.E.value")        : which(col.names == "domain.number.estimation.exp")
	integer.cols <- which(col.names == "domain.number.estimation.reg") : which(col.names == "domain.number.estimation.inc")
	readLines(fname) %>%
		data.frame(line = .) %>%
		filter(!grepl("^#", line)) %>%
		separate(line, into = col.names, sep = " +", extra = "merge", convert = F) %>%
		mutate_at(numeric.cols, as.numeric) %>%
		mutate_at(integer.cols, as.integer)
}
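`read.hmmer.tblout` assumes the standard HMMER3 `--tblout` layout: 18 whitespace-separated fields followed by a free-text description. That assumption can be checked with a quick Python parser; the field subset and the example row in the test are invented for illustration:

```python
def parse_tblout_line(line):
    # 18 fixed columns; the 19th (description) may itself contain spaces
    fields = line.split(None, 18)
    return {
        "target_name": fields[0],
        "query_name": fields[2],
        "full_seq_evalue": float(fields[4]),
        "best_dom_evalue": float(fields[7]),
        "description": fields[18] if len(fields) > 18 else "",
    }

def parse_tblout(lines):
    # skip comment lines starting with '#', as the R version does
    return [parse_tblout_line(l) for l in lines
            if l.strip() and not l.startswith("#")]
```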

data <- lapply(hmmsearch_files, read.hmmer.tblout) %>%
	bind_rows %>%
	extract(description.of.target, into = c("start", "end"), regex = "(\\d+) - (\\d+)", convert = T) %>%
	extract(target.name, into = "scaffold", regex = "^(.+)_\\d+$", remove = F) %>%
	extract(query.name, into = "gene", regex = "^([^_]+)", remove = F) %>%
	arrange(best.1.domain.E.value) %>%
	select(target.name, scaffold, query.name, gene, best.1.domain.E.value, best.1.domain.score, start, end) %>%
	filter(best.1.domain.E.value < E.value.threshold) %>%
	distinct(target.name, .keep_all = T)

relations <- left_join(data, data, by = "scaffold") %>%
	filter(start.x < start.y) %>%
	mutate(distance = pmin(abs(start.x - start.y), abs(start.x - end.y), abs(end.x - start.y), abs(end.x - end.y))) %>%
	filter(distance < distance.threshold) %>%
	arrange(start.x, start.y) %>%
	distinct(target.name.x, .keep_all = T) %>%
	select(target.name.x, target.name.y, scaffold, start.x, start.y, distance)

g <- graph_from_data_frame(relations, vertices = data)
V(g)$membership <- components(g)$membership

genes <- as_data_frame(g, "vertices") %>%
	mutate(molecule = paste(scaffold, membership)) %>%
	group_by(molecule) %>%
	mutate(n_genes = n_distinct(gene)) %>%
	filter(n_genes > 1)

group_by(genes, scaffold, membership) %>%
	summarize(start = min(start, end) - 1, end = max(start, end), .groups = "drop") %>%
	select(scaffold, start, end) %>%
	write.table(bed_file, row.names = F, col.names = F, quote = F, sep = "\t")
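The locus-building step above (igraph connected components over hits closer than `distance.threshold`, kept when a locus carries more than one distinct gene) can be approximated in plain Python. This sketch chains consecutive hits by start-to-previous-end gap, a simplification of the pairwise minimum distance used above; `merge_hits` is a hypothetical name:

```python
def merge_hits(hits, max_gap=200000, min_genes=2):
    """hits: (scaffold, gene, start, end) tuples, 1-based inclusive
    coordinates; returns BED-style (scaffold, start-1, end) loci."""
    by_scaffold = {}
    for scaffold, gene, start, end in hits:
        by_scaffold.setdefault(scaffold, []).append((start, end, gene))
    loci = []
    for scaffold, scaf_hits in sorted(by_scaffold.items()):
        scaf_hits.sort()
        chain = [scaf_hits[0]]
        for hit in scaf_hits[1:]:
            if hit[0] - chain[-1][1] < max_gap:   # close enough: same locus
                chain.append(hit)
            else:
                loci.append((scaffold, chain))
                chain = [hit]
        loci.append((scaffold, chain))
    # keep loci with enough distinct genes; BED is 0-based half-open
    return [(s, min(h[0] for h in c) - 1, max(h[1] for h in c))
            for s, c in loci
            if len({h[2] for h in c}) >= min_genes]
```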
from lxml import etree # Ubuntu Karmic package: python-lxml
import sys, re, os
import base64
from optparse import OptionParser
from io import IOBase
from six import string_types

import logging

log = logging.getLogger(__name__)


VERSION = '0.1.0' # keep in sync with setup.py

UNITS = ['pt','px','in','mm','cm']

PX_PER_INCH = 96    # http://wiki.inkscape.org/wiki/index.php/Units_In_Inkscape
                    # old inkscape versions are 90 or something else
MM_PER_INCH = 25.4

PT2IN = 1.0/72.0
IN2PT = 72.0
CM2PT = 72.0/2.54
PT2PX = 1.25
PX2PT = 1.0/1.25

relIRI_re = re.compile(r'url\(#(.*)\)')

def get_unit_attr(value):
    """ coordinate handling from http://www.w3.org/TR/SVG11/coords.html#Units
    """
    units = None # default (user)
    for unit_name in UNITS:
        if value.endswith(unit_name):
            log.debug("Units for {} are {}".format(value, unit_name))
            units = unit_name
            value = value[:-len(unit_name)]
            break
    val_float = float(value) # this will fail if units str not parsed
    return val_float, units

def convert_to_pixels(val, units):
    if units == 'px' or units is None:
        val_px = val
    elif units == 'pt':
        val_px = val*PT2PX
    elif units == 'in':
        val_px = val*IN2PT*PT2PX
    elif units == 'mm':
        val_px = val / MM_PER_INCH * PX_PER_INCH
    elif units == 'cm':
        val_px = val*CM2PT*PT2PX
    else:
        raise ValueError('unsupported unit conversion to pixels: %s' % units)
    log.debug("{} {} = {} px".format(val, units, val_px))
    return val_px
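`get_unit_attr` and `convert_to_pixels` can be folded into one helper. This sketch keeps the script's own conversion factors (note they mix the 90-dpi-era `PT2PX = 1.25` for pt/in/cm with 96 px/inch for mm); `length_to_px` is a hypothetical name:

```python
import re

UNIT_TO_PX = {
    None: 1.0, 'px': 1.0,        # user units are pixels
    'pt': 1.25,                  # PT2PX
    'in': 72 * 1.25,             # IN2PT * PT2PX
    'mm': 96 / 25.4,             # PX_PER_INCH / MM_PER_INCH
    'cm': (72 / 2.54) * 1.25,    # CM2PT * PT2PX
}

def length_to_px(value):
    # split an SVG length like "12.5mm" into number and optional unit
    m = re.fullmatch(r'([0-9.eE+-]+)(pt|px|in|mm|cm)?', value)
    if m is None:
        raise ValueError('cannot parse length: %r' % value)
    val, unit = float(m.group(1)), m.group(2)
    return val * UNIT_TO_PX[unit]
```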

def fix_ids( elem, prefix, level=0 ):
    ns = '{http://www.w3.org/2000/svg}'

    if isinstance(elem.tag, string_types) and elem.tag.startswith(ns):

        tag = elem.tag[len(ns):]

        if 'id' in elem.attrib:
            elem.attrib['id'] = prefix + elem.attrib['id']

        # fix references (See http://www.w3.org/TR/SVGTiny12/linking.html#IRIReference )

        for attrib in elem.attrib.keys():
            value = elem.attrib.get(attrib,None)

            if value is not None:

                if attrib.startswith('{http://www.w3.org/1999/xlink}'):
                    relIRI = False
                else:
                    relIRI = True

                if (not relIRI) and value.startswith('#'): # local IRI, change
                    iri = value[1:]
                    value = '#' + prefix + iri
                    elem.attrib[attrib] = value
                elif relIRI:
                    newvalue = re.sub( relIRI_re, r'url(#'+prefix+r'\1)', value)
                    if newvalue != value:
                        elem.attrib[attrib] = newvalue

        # Do same for children

    for child in elem:
        fix_ids(child,prefix,level=level+1)

def export_images( elem, filename_fmt='image%03d', start_idx=1 ):
    """replace inline images with files"""
    ns = '{http://www.w3.org/2000/svg}'
    href = '{http://www.w3.org/1999/xlink}href'
    count = 0
    if isinstance(elem.tag, string_types) and elem.tag.startswith(ns):
        tag = elem.tag[len(ns):]
        if tag=='image':
            buf = etree.tostring(elem, pretty_print=True)
            im_data = elem.attrib[href]
            exts = ['png','jpeg']
            found = False
            for ext in exts:
                prefix = 'data:image/'+ext+';base64,'
                if im_data.startswith(prefix):
                    data_base64 = im_data[len(prefix):]
                    found = True
                    break
            if not found:
                raise NotImplementedError('image found but not supported')

            # decode data
            data = base64.b64decode(data_base64)

            # save data
            idx = start_idx + count
            fname = filename_fmt%idx + '.' + ext
            if os.path.exists(fname):
                raise RuntimeError('File exists: %r'%fname)
            with open(fname, mode='wb') as fd:  # decoded bytes need binary mode
                fd.write(data)

            # replace element with link
            elem.attrib[href] = fname
            count += 1

    # Do same for children
    for child in elem:
        count += export_images(child, filename_fmt=filename_fmt,
                               start_idx=(start_idx+count) )
    return count

header_str = """<?xml version="1.0" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
 "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Created with svg_stack (http://github.com/astraw/svg_stack) -->
"""

# ------------------------------------------------------------------
class Document(object):
    def __init__(self):
        self._layout = None

    def setLayout(self,layout):
        self._layout = layout

    def save(self, fileobj, debug_boxes=False, **kwargs):
        if self._layout is None:
            raise ValueError('No layout, cannot save.')
        accum = LayoutAccumulator(**kwargs)
        self._layout.render(accum, debug_boxes=debug_boxes)
        try:
            isfile = isinstance(fileobj, file)
        except NameError:
            isfile = isinstance(fileobj, IOBase)
        if isfile:
            fd = fileobj
            close = False
        else:
            fd = open(fileobj, mode='w')
            close = True
        buf = accum.tostring(pretty_print=True)

        fd.write(header_str)
        fd.write( buf.decode() )
        if close:
            fd.close()


class SVGFileBase(object):
    def __init__(self, fname):
        self._fname = fname
        self._root = etree.parse(fname).getroot()
        if self._root.tag != '{http://www.w3.org/2000/svg}svg':
            raise ValueError('expected file to have root element <svg:svg>')

        if 'height' in self._root.keys():
            height, height_units = get_unit_attr(self._root.get('height'))
            width, width_units = get_unit_attr(self._root.get('width'))
        else:
            # R svglite does not set the height and width attributes. Get them from the viewBox attribute.
            vbox = self._root.get('viewBox')
            _, _, width, height = vbox.split(' ')
            width =  float(width)
            height = float(height)
            width_units = height_units = 'px' # The default
        self._width_px = convert_to_pixels( width, width_units)
        self._height_px = convert_to_pixels( height, height_units)

        log.debug("Size of {} is {:.2f} x {:.2f} px".format(fname, self._width_px, self._height_px))
        self._orig_width_px = self._width_px
        self._orig_height_px = self._height_px
        self._coord = None # unassigned

    def get_root(self):
        return self._root

    def get_size(self,min_size=None,box_align=None,level=None):
        return Size(self._width_px, self._height_px)

    def _set_size(self, size):
        if self._width_px != size.width:
            log.warning("Changing width of {} from {:.2f} to {:.2f}".format(
                self._fname, self._width_px, size.width))
            self._width_px = size.width
        if self._height_px != size.height:
            log.warning("Changing height of {} from {:.2f} to {:.2f}".format(
                self._fname, self._height_px, size.height))
            self._height_px = size.height

    def _set_coord(self,coord):
        self._coord = coord

    def export_images(self, *args, **kwargs):
        export_images(self._root, *args, **kwargs)


class SVGFile(SVGFileBase):
    def __str__(self):
        return 'SVGFile(%s)'%repr(self._fname)


class SVGFileNoLayout(SVGFileBase):
    def __init__(self,fname,x=0,y=0):
        self._x_offset = x
        self._y_offset = y
        super(SVGFileNoLayout, self).__init__(fname)

    def _set_coord(self,coord):
        self._coord = (coord[0] + self._x_offset,
                       coord[1] + self._y_offset )

    def __str__(self):
        return 'SVGFileNoLayout(%s)'%repr(self._fname)


class LayoutAccumulator(object):
    def __init__(self):
        self._svgfiles = []
        self._svgfiles_no_layout = []
        self._raw_elements = []

    def add_svg_file(self, svgfile):
        assert isinstance(svgfile,SVGFile)
        if svgfile in self._svgfiles:
            raise ValueError('cannot accumulate SVGFile instance twice')
        self._svgfiles.append( svgfile )

    def add_svg_file_no_layout(self,svgfile):
        assert isinstance(svgfile,SVGFileNoLayout)
        if svgfile in self._svgfiles_no_layout:
            raise ValueError('cannot accumulate SVGFileNoLayout instance twice')
        self._svgfiles_no_layout.append( svgfile )

    def add_raw_element(self,elem):
        self._raw_elements.append( elem )

    def tostring(self, **kwargs):
        root = self._make_finalized_root()
        return etree.tostring(root, **kwargs)

    def _set_size(self, size):
        self._size = size

    def _make_finalized_root(self):
        # get all required namespaces and prefixes
        NSMAP = {None : 'http://www.w3.org/2000/svg',
                 'sodipodi':'http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd',
                 }
        for svgfile in self._svgfiles:
            origelem = svgfile.get_root()
            for key, value in origelem.nsmap.items():
                if key in NSMAP:
                    assert value == NSMAP[key]
                    # Already in namespace dictionary
                    continue
                elif key == 'svg':
                    assert value == NSMAP[None]
                    # svg is the default namespace - don't insert again.
                    continue
                log.debug("adding {} to NSMAP at {}".format(value, key))
                NSMAP[key] = value

        root = etree.Element('{http://www.w3.org/2000/svg}svg',
                             nsmap=NSMAP)

        if 1:
            # inkscape hack
            root_defs = etree.SubElement(root,'{http://www.w3.org/2000/svg}defs')

        root.attrib['version']='1.1'
        fname_num = 0
        do_layout = True
        work_list=[]
        for svgfile in (self._svgfiles):
            work_list.append( (fname_num, do_layout, svgfile) )
            fname_num += 1
        do_layout = False
        for svgfile in (self._svgfiles_no_layout):
            work_list.append( (fname_num, do_layout, svgfile) )
            fname_num += 1
        for (fname_num, do_layout, svgfile) in work_list:
            origelem = svgfile.get_root()

            fix_id_prefix = 'id%d:' % fname_num
            elem = etree.SubElement(root,'{http://www.w3.org/2000/svg}g')

            elem.attrib['id'] = 'id{}'.format(fname_num)

            elem_sz = svgfile.get_size()
            width_px = elem_sz.width
            height_px = elem_sz.height

            # copy svg contents into new group
            for child in origelem:
                if 1:
                    # inkscape hacks
                    if child.tag == '{http://www.w3.org/2000/svg}defs':
                        # copy into root_defs, not into sub-group
                        log.debug("Copying element from {}".format(svgfile))
                        for subchild in child:
                            fix_ids( subchild, fix_id_prefix )
                            root_defs.append( subchild )
                        continue
                    elif child.tag == '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}namedview':
                        # don't copy
                        continue
                    elif child.tag == '{http://www.w3.org/2000/svg}metadata':
                        # don't copy
                        continue
                elem.append(child)

            fix_ids( elem, fix_id_prefix )

            translate_x = svgfile._coord[0]
            translate_y = svgfile._coord[1]
            if do_layout:
                if svgfile._orig_width_px != width_px:
                    log.info("svgfile: {} id {}".format(svgfile, fix_id_prefix))
                    log.error("orig width {:.2f} != width_px {:.2f}".format(
                        svgfile._orig_width_px, width_px))
                    raise NotImplementedError('rescaling width not implemented '
                                              '(hint: set alignment on file %s)'%(
                        svgfile,))
                if svgfile._orig_height_px != height_px:
                    log.info("svgfile: {} id {}".format(svgfile, fix_id_prefix))
                    log.error("orig height {:.2f} != height_px {:.2f}".format(
                        svgfile._orig_height_px, height_px))
                    raise NotImplementedError('rescaling height not implemented '
                                              '(hint: set alignment on file %s)'%(
                        svgfile,))
            orig_viewBox = origelem.get('viewBox')
            if orig_viewBox is not None:
                # split by comma or whitespace
                vb_tup = orig_viewBox.split(',')
                vb_tup = [c.strip() for c in vb_tup]
                if len(vb_tup)==1:
                    # not separated by commas
                    vb_tup = orig_viewBox.split()
                assert len(vb_tup)==4
                vb_tup = [float(v) for v in vb_tup]
                vbminx, vbminy, vbwidth, vbheight = vb_tup
                sx = width_px / vbwidth
                sy = height_px / vbheight
                tx = translate_x - vbminx
                ty = translate_y - vbminy
                elem.attrib['transform'] = 'matrix(%s,0,0,%s,%s,%s)'%(sx, sy, tx, ty)
                log.debug("matrix xform ({}, 0, 0, {}, {}, {})".format(sx, sy, tx, ty))
            else:
                elem.attrib['transform'] = 'translate(%s,%s)'%(translate_x, translate_y)
                log.debug("Translating ({}, {})".format(translate_x, translate_y))
            root.append( elem )
        for elem in self._raw_elements:
            root.append(elem)

        root.attrib["width"] = repr(self._size.width)
        root.attrib["height"] = repr(self._size.height)

        return root


# ------------------------------------------------------------------
class Size(object):
    def __init__(self, width=0, height=0):
        self.width=width
        self.height=height

# directions for BoxLayout
LeftToRight = 'LeftToRight'
RightToLeft = 'RightToLeft'
TopToBottom = 'TopToBottom'
BottomToTop = 'BottomToTop'

# alignment values
AlignLeft = 0x01
AlignRight = 0x02
AlignHCenter = 0x04

AlignTop = 0x020
AlignBottom = 0x040
AlignVCenter = 0x080

AlignCenter = AlignHCenter | AlignVCenter

class Layout(object):
    def __init__(self, parent=None):
        if parent is not None:
            raise NotImplementedError('')


class BoxLayout(Layout):
    def __init__(self, direction, parent=None):
        super(BoxLayout, self).__init__(parent=parent)
        self._direction = direction
        self._items = []
        self._contents_margins = 0 # around edge of box
        self._spacing = 0 # between items in box
        self._coord = (0, 0) # default
        self._size = None # uncalculated

    def _set_coord(self,coord):
        self._coord = coord

    def render(self, accum, min_size=None, level=0, debug_boxes=0):
        size = self.get_size(min_size=min_size)
        if level==0:
            # set document size if top level
            accum._set_size(size)
        if debug_boxes>0:
            # draw black line around BoxLayout element
            debug_box = etree.Element('{http://www.w3.org/2000/svg}rect')
            debug_box.attrib['style']=(
                'fill: none; stroke: black; stroke-width: 2.000000;')
            sz=size
            debug_box.attrib['x']=repr(self._coord[0])
            debug_box.attrib['y']=repr(self._coord[1])
            debug_box.attrib['width']=repr(sz.width)
            debug_box.attrib['height']=repr(sz.height)
            accum.add_raw_element(debug_box)

        for (item, stretch, alignment, xml) in self._items:
            if isinstance( item, SVGFile ):
                accum.add_svg_file(item)

                if debug_boxes>0:
                    # draw red line around SVG file
                    debug_box= etree.Element('{http://www.w3.org/2000/svg}rect')
                    debug_box.attrib['style']=(
                        'fill: none; stroke: red; stroke-width: 1.000000;')
                    sz=item.get_size()
                    debug_box.attrib['x']=repr(item._coord[0])
                    debug_box.attrib['y']=repr(item._coord[1])
                    debug_box.attrib['width']=repr(sz.width)
                    debug_box.attrib['height']=repr(sz.height)
                    accum.add_raw_element(debug_box)
            elif isinstance( item, SVGFileNoLayout ):
                accum.add_svg_file_no_layout(item)

                if debug_boxes>0:
                    # draw green line around SVG file
                    debug_box= etree.Element('{http://www.w3.org/2000/svg}rect')
                    debug_box.attrib['style']=(
                        'fill: none; stroke: green; stroke-width: 1.000000;')
                    sz=item.get_size()
                    debug_box.attrib['x']=repr(item._coord[0])
                    debug_box.attrib['y']=repr(item._coord[1])
                    debug_box.attrib['width']=repr(sz.width)
                    debug_box.attrib['height']=repr(sz.height)
                    accum.add_raw_element(debug_box)

            elif isinstance( item, BoxLayout ):
                item.render( accum, min_size=item._size, level=level+1,
                             debug_boxes=debug_boxes)
            else:
                raise NotImplementedError(
                    "don't know how to accumulate item %s"%item)

            if xml is not None:
                extra = etree.Element('{http://www.w3.org/2000/svg}g')
                extra.attrib['transform'] = 'translate(%s,%s)'%(
                    repr(item._coord[0]),repr(item._coord[1]))
                extra.append(xml)
                accum.add_raw_element(extra)

    def get_size(self, min_size=None, box_align=0, level=0):
        cum_dim = 0 # size along layout direction
        max_orth_dim = 0 # size along other direction

        if min_size is None:
            min_size = Size(0, 0)

        # Step 1: calculate required size along self._direction
        if self._direction in [LeftToRight, RightToLeft]:
            max_orth_dim = min_size.height
            dim_min_size = Size(width=0,
                                height=max_orth_dim)
        else:
            max_orth_dim = min_size.width
            dim_min_size = Size(width=max_orth_dim,
                                height=0)

        cum_dim += self._contents_margins # first margin
        item_sizes = []
        for item_number,(item,stretch,alignment,xml) in enumerate(self._items):
            if isinstance(item, SVGFileNoLayout):
                item_size = Size(0, 0)
            else:
                item_size = item.get_size(min_size=dim_min_size, box_align=alignment,level=level+1)
            item_sizes.append( item_size )

            if isinstance(item, SVGFileNoLayout):
                # no layout for this file
                continue

            if self._direction in [LeftToRight, RightToLeft]:
                cum_dim += item_size.width
                max_orth_dim = max(max_orth_dim,item_size.height)
            else:
                cum_dim += item_size.height
                max_orth_dim = max(max_orth_dim,item_size.width)

            if (item_number+1) < len(self._items):
                cum_dim += self._spacing # space between elements
        cum_dim += self._contents_margins # last margin
        orth_dim = max_orth_dim # value without adding margins
        max_orth_dim += 2*self._contents_margins # margins

        # ---------------------------------

        # Step 2: another pass in which expansion takes place
        total_stretch = 0
        for item,stretch,alignment,xml in self._items:
            total_stretch += stretch
        if (self._direction in [LeftToRight, RightToLeft]):
            dim_unfilled_length = max(0,min_size.width - cum_dim)
        else:
            dim_unfilled_length = max(0,min_size.height - cum_dim)

        stretch_hack = False
        if dim_unfilled_length > 0:
            if total_stretch == 0:
                # BoxLayout in which stretch is 0, but unfilled space
                # exists.

                # XXX TODO: what is Qt policy in this case?
                stretch_hack = True
                stretch_inc = 0
            else:
                stretch_inc = dim_unfilled_length / float(total_stretch)
        else:
            stretch_inc = 0

        cum_dim = 0 # size along layout direction
        cum_dim += self._contents_margins # first margin
        is_last_item = False
        for i,(_item,old_item_size) in enumerate(zip(self._items,item_sizes)):
            if (i+1) >= len(self._items):
                is_last_item=True
            (item,stretch,alignment,xml) = _item
            if (self._direction in [LeftToRight, RightToLeft]):
                new_dim_length = old_item_size.width + stretch*stretch_inc
                if stretch_hack and is_last_item:
                    new_dim_length = old_item_size.width + dim_unfilled_length
                new_item_size = Size( new_dim_length, orth_dim )
            else:
                new_dim_length = old_item_size.height + stretch*stretch_inc
                if stretch_hack and is_last_item:
                    new_dim_length = old_item_size.height + dim_unfilled_length
                new_item_size = Size( orth_dim, new_dim_length )

            if isinstance(item, SVGFileNoLayout):
                item_size = Size(0,0)
            else:
                item_size = item.get_size(min_size=new_item_size, box_align=alignment,level=level+1)
            if self._direction == LeftToRight:
                child_box_coord = (cum_dim, self._contents_margins)
            elif self._direction == TopToBottom:
                child_box_coord = (self._contents_margins, cum_dim)
            else:
                raise NotImplementedError(
                    'direction %s not implemented'%self._direction)
            child_box_coord = (child_box_coord[0] + self._coord[0],
                               child_box_coord[1] + self._coord[1])
            child_box_size = new_item_size

            item_pos, final_item_size = self._calc_box( child_box_coord, child_box_size,
                                                        item_size,
                                                        alignment )
            # FIXME : don't call internal function on another class
            # FIXME : get_size should not set size
            item._set_coord( item_pos )
            item._set_size( final_item_size )

            if self._direction in [LeftToRight, RightToLeft]:
                # Use requested item size so ill behaved item doesn't
                # screw up layout.
                cum_dim += new_item_size.width
            else:
                # Use requested item size so ill behaved item doesn't
                # screw up layout.
                cum_dim += new_item_size.height

            if not is_last_item:
                cum_dim += self._spacing # space between elements
        cum_dim += self._contents_margins # last margin

        # ---------------------------------

        # Step 3: calculate coordinates of each item

        if self._direction in [LeftToRight, RightToLeft]:
            size = Size(cum_dim, max_orth_dim)
        else:
            size = Size(max_orth_dim, cum_dim)

        self._size = size
        return size

    def _calc_box(self, in_pos, in_sz, item_sz, alignment):
        if (AlignLeft & alignment):
            left = in_pos[0]
            width = item_sz.width
        elif (AlignRight & alignment):
            left = in_pos[0]+in_sz.width-item_sz.width
            width = item_sz.width
        elif (AlignHCenter & alignment):
            left = in_pos[0]+0.5*(in_sz.width-item_sz.width)
            width = item_sz.width
        else:
            # expand
            left = in_pos[0]
            width = in_sz.width

        if (AlignTop & alignment):
            top = in_pos[1]
            height = item_sz.height
        elif (AlignBottom & alignment):
            top = in_pos[1]+in_sz.height-item_sz.height
            height = item_sz.height
        elif (AlignVCenter & alignment):
            top = in_pos[1]+0.5*(in_sz.height-item_sz.height)
            height = item_sz.height
        else:
            # expand
            top = in_pos[1]
            height = in_sz.height

        pos = (left,top)
        size = Size(width,height)
        return pos,size

    def _set_size(self, size):
        self._size = size

    def setSpacing(self,spacing):
        self._spacing = spacing

    def addSVG(self, svg_file, stretch=0, alignment=0, xml=None):
        if not isinstance(svg_file, SVGFile):
            svg_file = SVGFile(svg_file)
        if xml is not None:
            xml = etree.XML(xml)
        self._items.append((svg_file, stretch, alignment, xml))

    def addSVGNoLayout(self, svg_file, x=0, y=0, xml=None):
        if not isinstance(svg_file,SVGFileNoLayout):
            svg_file = SVGFileNoLayout(svg_file,x=x,y=y)
        stretch=0
        alignment=0
        if xml is not None:
            xml = etree.XML(xml)
        self._items.append((svg_file,stretch,alignment,xml))

    def addLayout(self, layout, stretch=0):
        assert isinstance(layout, Layout)
        alignment=0 # always expand a layout
        xml=None
        self._items.append((layout, stretch, alignment, xml))


class HBoxLayout(BoxLayout):
    def __init__(self, parent=None):
        super(HBoxLayout, self).__init__(LeftToRight, parent=parent)


class VBoxLayout(BoxLayout):
    def __init__(self, parent=None):
        super(VBoxLayout, self).__init__(TopToBottom, parent=parent)

# ------------------------------------------------------------------

def main():
    usage = '''%prog FILE1 [FILE2] [...] [options]

concatenate SVG files

This will concatenate FILE1, FILE2, ... to a new svg file printed to
stdout.

'''

    parser = OptionParser(usage, version=VERSION)
    parser.add_option("--margin",type='str',
                      help='size of margin (in any units, px default)',
                      default=None)
    parser.add_option("--direction",type='str',
                      default='vertical',
                      help='horizontal or vertical (or h or v)')
    (options, args) = parser.parse_args()
    fnames = args

    if options.direction.lower().startswith('v'):
        direction = 'vertical'
    elif options.direction.lower().startswith('h'):
        direction = 'horizontal'
    else:
        raise ValueError('unknown direction %s'%options.direction)

    if options.margin is not None:
        margin_px = convert_to_pixels(*get_unit_attr(options.margin))
    else:
        margin_px = 0

    if 0:
        fd = open('tmp.svg',mode='w')
    else:
        fd = sys.stdout

    doc = Document()
    if direction == 'vertical':
        layout = VBoxLayout()
    elif direction == 'horizontal':
        layout = HBoxLayout()

    for fname in fnames:
        layout.addSVG(fname, alignment=AlignCenter)

    layout.setSpacing(margin_px)
    doc.setLayout(layout)
    doc.save( fd )

if __name__=='__main__':
    main()
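The geometry in the layout code above is easy to exercise in isolation. The sketch below (function names `calc_box` and `viewbox_matrix` are hypothetical, mirroring `_calc_box` and the viewBox branch of `_make_finalized_root`) reproduces the alignment arithmetic and the `matrix(sx,0,0,sy,tx,ty)` parameters as plain functions:

```python
# Alignment flags, matching the values defined in the layout code above.
AlignLeft, AlignRight, AlignHCenter = 0x01, 0x02, 0x04
AlignTop, AlignBottom, AlignVCenter = 0x020, 0x040, 0x080

def calc_box(in_pos, in_size, item_size, alignment):
    """Where an item of item_size lands inside the box at in_pos with
    in_size, under the given alignment flags. An axis with no alignment
    flag set expands the item to fill the box (as in _calc_box)."""
    in_w, in_h = in_size
    it_w, it_h = item_size
    if alignment & AlignLeft:
        left, width = in_pos[0], it_w
    elif alignment & AlignRight:
        left, width = in_pos[0] + in_w - it_w, it_w
    elif alignment & AlignHCenter:
        left, width = in_pos[0] + 0.5 * (in_w - it_w), it_w
    else:  # expand horizontally
        left, width = in_pos[0], in_w
    if alignment & AlignTop:
        top, height = in_pos[1], it_h
    elif alignment & AlignBottom:
        top, height = in_pos[1] + in_h - it_h, it_h
    elif alignment & AlignVCenter:
        top, height = in_pos[1] + 0.5 * (in_h - it_h), it_h
    else:  # expand vertically
        top, height = in_pos[1], in_h
    return (left, top), (width, height)

def viewbox_matrix(viewbox, width_px, height_px, tx_px, ty_px):
    """Scale/translate parameters applied when a child SVG carries a
    viewBox; accepts comma- or whitespace-separated values, mirroring
    the parsing in _make_finalized_root."""
    parts = [p.strip() for p in viewbox.split(',')]
    if len(parts) == 1:
        parts = viewbox.split()
    minx, miny, w, h = map(float, parts)
    return (width_px / w, height_px / h, tx_px - minx, ty_px - miny)
```

Centering a 40x20 item in a 100x50 box yields position (30, 15); with no alignment flags the item simply fills the box.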
library(dplyr)
library(tidyr)
library(tools)
library(ggraph)
library(ggforce)
library(igraph)
library(tidyverse)

if (interactive()) {
    setClass("snakemake", slots = list(input = "list", output = "list", params = "list", wildcards = "list"))
    snakemake <- new("snakemake",
        input  = list(
            network  = "bak2/analysis/vcontact2/results/c1.ntw",
            genomes  = "bak2/analysis/vcontact2/results/genome_by_genome_overview.csv",
            profiles = "bak2/analysis/vcontact2/results/vConTACT_profiles.csv",
            segments = "metadata/viral_segments.tsv",
            viruses  = "metadata/viruses.tsv",
            colors   = "metadata/subclade_colors.txt"
        ),
        output = list(
            clustering = "bak2/output/vcontact2_clustering.svg"
        )
    )
}
with(snakemake@input, {
    network_file  <<- network
    genomes_file  <<- genomes
    profiles_file <<- profiles
    segments_file <<- segments
    viruses_file  <<- viruses
    colors_file   <<- colors
})

with(snakemake@output, {
    clustering_file <<- clustering
})

segments <- read.table(segments_file, sep = "\t", header = T)
viruses  <- bind_rows(
    segments,
    read.table(viruses_file, sep = "\t", header = T)
)
subclades <- with(viruses, setNames(subclade, short))

sort_components <- function(.graph) {
    decompose(.graph) %>%
        `[`(order(sapply(., length), decreasing = T))
}

genomes <- read.csv(genomes_file, na.strings = "") %>%
    group_by(VC) %>%
    mutate(Cluster = ifelse(is.na(VC), Genome, paste(VC, paste(Genome, collapse = ", ")))) %>%
    left_join(viruses, by = c(Genome = "short"))
subclusters <- with(genomes, setNames(VC.Subcluster, Genome))

self_edges <- with(genomes, data.frame(A = Genome, B = Genome, weight = 1e-10))
network <- read.table(network_file, col.names = c("A","B","weight")) %>%
    bind_rows(self_edges) %>%
    mutate(weight = weight / 1000) %>%
    graph_from_data_frame(directed = F) %>%
    set_vertex_attr("subclade", value = subclades[V(.)$name]) %>%
    set_vertex_attr("subcluster", value = subclusters[V(.)$name])

colors <- read.table(colors_file, comment.char = "") %>%
    with(setNames(V2,V1)) %>%
    c("darkgray")

p <- ggraph(network, "fr") +
    geom_edge_link(aes(color = -weight, width = weight), alpha = .1) +
    geom_node_point(aes(color = subclade), size = 2) +
    geom_node_text(aes(label = ifelse(is.na(subcluster), name, sprintf("%s\n(%s)", name, subcluster))), repel = T, nudge_y = -0.01) +
    scale_color_manual(values = colors) # +
    #theme_graph()
ggsave(clustering_file, p, width = 10, height = 10)
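The tiny self-edges added above (`weight = 1e-10`) exist only so that genomes with no similarity hits still appear as singleton vertices when the graph is built from an edge list. The same effect, sketched in Python with illustrative names (`vertex_set` and `Singleton-1` are hypothetical):

```python
def vertex_set(edges):
    """Vertices implied by an edge list of (a, b, weight) tuples; a
    genome absent from every pair silently drops out of the graph."""
    vs = set()
    for a, b, _w in edges:
        vs.update((a, b))
    return vs

genomes = ["Gezel-14T", "PgV-16T", "Singleton-1"]
similarity_edges = [("Gezel-14T", "PgV-16T", 120.0)]

# Negligible self-edges (the R script's weight = 1e-10) keep hit-less
# genomes in the vertex set without visibly affecting the layout:
with_self_edges = similarity_edges + [(g, g, 1e-10) for g in genomes]
```

Without the self-edges, `Singleton-1` would be missing from the plotted network entirely.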
16
17
script:
    "scripts/extract_element.py"
27
28
script:
    "scripts/mcl_add_to_gbk.py"
13
14
shell:
    "getorf -minsize 100 -maxsize 100000 -filter {input} > {output}"
23
24
shell:
    "hmmsearch -o /dev/null --tblout {output} {input.hmm} {input.fasta}"
37
38
39
40
41
shell:
    """
    awk '!/^#/&&$5<{params.evalue}{{print $1,$3}}' OFS=\\\\t {input.txt} | \
        parallel --colsep \\\\t seqkit faidx -f {input.fasta} {{1}} \| seqkit replace -p "'$'" -r '" {{2}}"' | seqkit seq -gm 100
    """
55
56
script:
    "scripts/parse_hmmsearch.R"
67
68
shell:
    "seqkit subseq --up-stream {params.flanks} --down-stream {params.flanks} --bed {input.bed} {input.genome} | cut -f1 -d: > {output}"
77
78
79
80
81
shell:
    """
    seqkit split -iO {output} {input}
    rename s/segments.id_// {output}/*.fna
    """
90
91
shell:
    "gmsn.pl --format GFF3 --virus --output {output} {input}"
102
103
shell:
    "prodigal -i {input} -m -g 1 -p meta -f gff > {output}"
117
118
shell:
    "gffcompare -p {params.cprefix} -CTo {params.outprefix} {input}"
130
131
shell:
    "gffread -g {input.fna} -w - -o {output.gff} {input.gtf} | seqkit translate --trim -o {output.faa}"
145
146
shell:
    "cat {input} > {output}"
153
154
shell:
    "cat {input} > {output}"
167
168
shell:
    "hmmscan --cpu {threads} -o /dev/null --tblout {output.tblout} --domtblout {output.domtblout} {input.hmm} {input.fasta}"
181
182
shell:
    "hmmscan --cpu {threads} -o /dev/null --tblout {output.tblout} --domtblout {output.domtblout} {input.hmm} {input.fasta}"
195
196
shell:
    "hmmscan --cpu {threads} -o /dev/null --tblout {output.tblout} --domtblout {output.domtblout} {input.hmm} {input.fasta}"
205
206
shell:
    "hmmsearch -o /dev/null --tblout {output} {input.hmm} {input.fasta}"
219
220
shell:
    "pfam_scan.pl -fasta {input} -dir {params.pfam_dir} -outfile {output} -cpu {threads}"
230
231
shell:
    "blastn -query {input.fna} -db {input.fna} -outfmt 6 -out {output} -evalue 1e-20"
256
257
script:
    "scripts/gff2gbk.py"
82
83
run:
    print(f"{bcolors.FAIL}No task selected - choose one of {', '.join(exposed_rules)}\nKeep in mind that all rules except search_segments are dependent on extract_segments{bcolors.ENDC}\n")
92
93
shell:
    "makeblastdb -in {input} -dbtype prot"
102
103
shell:
    "makeblastdb -in {input} -dbtype nucl"
112
113
shell:
    "seqkit faidx {input}"
122
123
shell:
    "seqkit faidx -f {input}"
130
131
shell:
    "mad {input}"
13
14
shell:
    "cat {input} > {output}"
24
25
26
27
shell:
    """
    parallel --tagstring {{/.}} seqkit seq -ni {{}} ::: {input} | awk -vOFS=, 'BEGIN{{print"protein_id","contig_id","keywords"}}{{print$2,$1,""}}' > {output}
    """
47
48
shell:
    "vcontact2 --threads {threads} --db {params.db} --raw-proteins {input.fasta} --rel-mode {params.rel_mode} --proteins-fp {input.mapping} --pcs-mode {params.pcs_mode} --vcs-mode {params.vcs_mode} --c1-bin $CONDA_PREFIX/lib/cluster_one-v1.0.jar --output-dir {output.outdir}"
62
63
script:
    "scripts/vcontact2.R"