Robust Optogenetic Inhibition with Red-light-sensitive Anion-conducting Channelrh

public 1yr ago 0 bookmarks

View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output, operation, topic

You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .

ACRs

This repository hosts the bioinformatic analysis worklow used in Oppermann et al (2023) Robust Optogenetic Inhibition with Red-light-sensitive Anion-conducting Channelrhodopsins, bioRxiv .

The snakemake -based code for the workflow is under workflow/ . The dependencies are under conda control ( snakemake --use-conda ), see workflow/envs . The analysis files are under analysis/ , in particular ACRs/analysis/pdb contains the output files for the structural alignment ( sequences.aln is the main output) and analysis/all hosts the various files generated for the phylogenetic analysis:

sequences.fasta : unaligned sequences
cdhit.fasta : non-redundant set of sequences
mafft.fasta : mafft alignment
trimal.fasta : trimal trimmed alignment
iqtree.treefile : the phylogenetic tree from iqtree with ultrafasta bootstrap support values

The metadata for the sequences can be found in metadata/Channelrhodopsins_Updated_List.xlsx which is a snapshot of the Catalogue of Natural Channelrhodopsins .

Code Snippets

library(dplyr)
library(tidyr)
library(bio3d)

template_file <- unlist(snakemake@input)
output_file <- unlist(snakemake@output)

templates <- read.table("pdb/template.txt", col.names = c("ID", "_P_", "fname")) %>%
    mutate(ID = substring(ID, 2))
data <- lapply(templates$fname, read.pdb) %>%
    setNames(templates$ID)
sheets <- lapply(data, `[[`, "sheet") %>%
    lapply(data.frame) %>%
    setNames(templates$ID) %>%
    bind_rows(.id = "ID") %>%
    mutate(sense = ifelse("sense" %in% names(.), as.numeric(sense), NA), sense = ifelse(sense < 0, "-", "+"))
helices <- lapply(data, `[[`, "helix") %>%
    lapply(data.frame) %>%
    setNames(templates$ID) %>%
    bind_rows(.id = "ID")
list(sheet = sheets, helix = helices) %>%
    bind_rows(.id = "ss") %>%
    filter(chain == "A") %>%
    replace_na(list(sense = ".")) %>%
    mutate(source = "SS", score = ".", frame = ".", attrib = ".") %>%
    select(ID, source, ss, start, end, score, sense, frame, attrib) %>%
    write.table(output_file, quote = F, sep = "\t", col.names = F, row.names = F)

R dplyr tidyr Bio3D From line 1 of scripts/features.R

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

template_txt = str(snakemake.input)
fasta_file = str(snakemake.output)

aminoacids = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "LYR": "K"
}

def to_fasta(fname):
    has_seqres = False
    seq = []
    with open(fname) as fh:
        for line in fh:
            if line.startswith('SEQRES'):
                has_seqres = True
                tag, row_num, chain, length, *residues = line.split()
                if chain == 'A':
                    seq += [ aminoacids[x] for x in residues ]
            elif not has_seqres and line.startswith('ATOM'):
                tag, atom_num, atom, residue, chain, residue_num, *rest = line.split()
                if chain == 'A' and atom == 'N':
                    seq.append(aminoacids[residue])
    return ''.join(seq)

with open(fasta_file, 'w') as out:
    with open(template_txt) as fh:
        for line in fh:
            name, tag, pdb_file = line.split()
            seq = to_fasta(pdb_file)
            record = SeqRecord(Seq(seq), name[1:], description = '')
            SeqIO.write(record, out, 'fasta')

Python Biopython From line 2 of scripts/pdb2fasta.py

library(dplyr)
library(tidyr)
library(treeio)
library(ggtree)
library(readxl)
library(photobiology)
library(ggplot2)
library(ggnewscale)
library(castor)
library(ape)

with(snakemake@input, {
    tree_file <<- tree
    metadata_file <<- metadata
})

output_file <- unlist(snakemake@output)

to_treedata <- function(tree) {
    class(tree) <- c("tbl_tree", "tbl_df", "tbl", "data.frame")
    as.treedata(tree)
}

add_hsp <- function(tree, colname) {
    colname <- deparse(substitute(colname))
    treedata <- to_treedata(tree)
    categories <- setNames(tree[[colname]], tree[["label"]]) %>%
        `[`(treedata@phylo$tip.label) %>%
        as.factor
    hsp <- hsp_max_parsimony(treedata@phylo, as.numeric(categories), edge_exponent = 0.1) %>%
        `$`("likelihoods") %>%
        as.data.frame %>%
        setNames(levels(categories)) %>%
        mutate(node = 1:n()) %>%
        gather(value, likelihood, -node) %>%
        filter(likelihood > 0.99) %>%
        setNames(c("node", paste0(colname, "_hsp"), paste0(colname, "_hsp_lh")))
    left_join(tree, hsp, by = "node")
}
get_mrca <- function(phylo, tips) {
    getMRCA(phylo, tips) %>%
        replace(is.null(.), NA)
}
add_mrca <- function(tree, colname) {
    colname <- deparse(substitute(colname))
    treedata <- to_treedata(tree)
    mrca <- mutate(tree, my_column = !!as.name(colname)) %>%
        group_by(my_column) %>%
        mutate(is.tip = label %in% treedata@phylo$tip.label) %>%
        mutate(no_data = all(is.na(my_column))) %>%
        mutate(mrca = get_mrca(treedata@phylo, node[is.tip])) %>%
        mutate(mrca = ifelse(no_data | is.na(mrca), node, mrca)) %>%
        group_by(mrca) %>%
        mutate(enough_tips = sum(is.tip) > 1) %>%
        mutate(ifelse(node == mrca & enough_tips, first(na.omit(my_column)), NA)) %>%
        pull
    tree[[paste0(colname, "_mrca")]] <- mrca
    return(tree)
}

metadata <- read_xlsx(metadata_file, .name_repair = "universal") %>%
    mutate(Sequence.name = sub(",.+", "", Sequence.name)) %>%
    mutate(Sequence.name = gsub("@", "_", Sequence.name)) %>%
    mutate(Maximum..nm = ifelse(is.na(Action.maximum..nm), Absorption.maximum..nm, Action.maximum..nm)) 

tree <- read.iqtree(tree_file) %>%
    as_tibble %>%
    left_join(metadata, by = c(label = "Sequence.name")) %>%
    mutate(Color = unname(w_length2rgb(Maximum..nm))) %>%
    mutate(Category = case_when(Currents %in% c("no photocurrents", "channel") ~ NA_character_, T ~ gsub("[][]", "", Currents))) %>%
    mutate(Symbol = gsub(",.+", "", Symbol)) %>%
    mutate(Symbol_show = ifelse(Currents.reference == "[Oppermann23](in_prep)", Symbol, NA)) %>%
    add_mrca(ChR.group) %>%
    add_hsp(Category)

cat_colors <- list(
    "anion channel" = "indianred",
    "cation channel" = "deepskyblue",
    "potassium channel" = "purple",
    "channel" = "yellow4"
)
p <- ggtree(to_treedata(tree), aes(color = Category_hsp), layout = "ape") +
    scale_color_manual(values = cat_colors) + new_scale_color() +
    geom_nodepoint(aes(x = branch.x, y = branch.y, subset = !is.na(UFboot) & UFboot >= 95), size = 0.2, color = "#4d4d4dff") +
    geom_nodepoint(aes(x = branch.x, y = branch.y, subset = !is.na(UFboot) & UFboot >= 90 & UFboot < 95), size = 0.2, color = "#b3b3b3ff") +
    geom_tippoint(aes(subset = !is.na(Category) & is.na(Color)), color = "darkgray") +
    geom_tippoint(aes(subset = !is.na(Color), color = Color)) + scale_colour_identity() + new_scale_color() +
    geom_tiplab2(aes(label = Symbol_show), hjust = -0.2) +
    geom_treescale(width = 0.5) +
    geom_cladelab(mapping = aes(subset = !is.na(ChR.group_mrca), node = node, label = ChR.group_mrca), offset = -0.1) +
    xlim(-10, 10)
ggsave(output_file, p, width = 7, height = 7)

R ggplot2 dplyr tidyr readxl APE ggtree treeio ggnewscale castor photobiology From line 1 of scripts/plot_tree.R

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

with open(str(snakemake.output), 'w') as file:
    for name, seq in snakemake.params['seq'].items():
        record = SeqRecord(Seq(seq), id = name)
        SeqIO.write(record, file, 'fasta')

Python Biopython From line 2 of scripts/write_fasta.py

script:
    "scripts/write_fasta.py"

SnakeMake From line 28 of workflow/Snakefile

shell:
    "mafft --thread {threads} --auto {input} > {output}"

SnakeMake MAFFT API (EBI) From line 40 of workflow/Snakefile

shell:
    "cdhit -i {input} -o {output} -c {params.c} -d 0"

SnakeMake From line 52 of workflow/Snakefile

shell:
    "trimal -in {input} -out {output} -gt {params.gt}"

SnakeMake trimAl From line 64 of workflow/Snakefile

shell:
    "gotree reroot midpoint -i {input} -o {output}"

SnakeMake Gotree From line 74 of workflow/Snakefile

shell:
    "iqtree2 -seed {params.seed} -s {input} -B {params.B} -T {threads} -pers {params.pers} -nstop {params.nstop} --prefix {params.prefix} -redo"

SnakeMake From line 92 of workflow/Snakefile

script:
    "scripts/pdb2fasta.py"

SnakeMake From line 102 of workflow/Snakefile

shell:
    "t_coffee {input.sequences} -outfile {output.aln} -newtree {output.dnd} -method {params.method} -template_file {input.template} -pdb_min_sim {params.pdb_min_sim} -pdb_min_cov {params.pdb_min_cov} -n_core {threads}"