Performance analyses of classifier ensembles on peptide encodings

public 1yr ago 0 bookmarks

View Workflow

image info

Overview image illustrating the performance analyses of classifier ensembles on peptide encodings. An arbitrary number of encoded datasets can be processed (arrows). The workflow conducts preprocessing (a), ensemble pruning (b), training/testing (c), and processing of results (d) using a 100-fold Monte Carlo cross-validation. Manuscript submitted for publication.

Execution

To run all experiments, execute `snakemake --cores 32 --quiet`.

To run only the ensemble pruning with the Decision Tree classifier as the base model and stacking as the meta-model for five folds, run `snakemake data/temp/avp_amppred/ensemble_pfront/stacking/dt/{0,1,2,3,4}.csv --cores 32 --quiet`.

This example will run the pruning for the avp_amppred dataset.

Code Snippets

# Critical-difference statistics for the ensemble results.
#
# Reads every result table listed in snakemake@input, computes average ranks
# and the Nemenyi test (scmamp) once per pruning category ("cat") and once per
# base-model / meta-model combination, and writes the critical differences and
# average rankings as YAML to snakemake@output[[1]].
Sys.setlocale("LC_NUMERIC","en_US.UTF-8")

library(scmamp)
library(yaml)

paths <- snakemake@input

# Stack all input tables row-wise. The column "X" is dropped from each table
# (presumably the unnamed index column of the CSV -- TODO confirm).
d <- read.csv(paths[[1]])
d["X"] <- NULL
# seq_along(paths)[-1] is empty when there is only one input; the former
# 2:length(paths) counted down (2, 1) in that case and failed on paths[[2]].
for (idx in seq_along(paths)[-1]) {
  df <- read.csv(paths[[idx]])
  df["X"] <- NULL
  d <- rbind(d, df)
}

# The "mvo" category is excluded from the comparison.
d <- d[(d$cat != "mvo"), ]

# Number the rows within each category so that the categories can be aligned
# side by side when reshaping to wide format.
for (category in unique(d[["cat"]])) {
  d[d["cat"] == category, "idx"] <- 1:sum(d["cat"] == category)
}

# Columns 2-4 are dropped by position before reshaping
# (NOTE(review): confirm which columns these are in the input tables).
df_cats <- reshape(d[, -2:-4], idvar = "idx", timevar = "cat", direction = "wide")
df_cats$idx <- NULL
colnames(df_cats) <- gsub("mcc.", "", colnames(df_cats))

# plotCD(results.matrix = df_cats, alpha = 0.05)

ar_cats <- sort(colMeans(rankMatrix(df_cats)))

nm_cats <- nemenyiTest(df_cats)
rownames(nm_cats$diff.matrix) <- colnames(nm_cats$diff.matrix)

# One column of MCC values per (model, meta_model) combination.
lists <- apply(
  X = expand.grid(unique(d[["model"]]), unique(d[["meta_model"]])),
  MARGIN = 1,
  FUN = function(row) {
    df <- data.frame(d[d["model"] == row[1] & d["meta_model"] == row[2], "mcc"])
    colnames(df) <- paste0(row[1], "_", row[2])
    return(df)
  }
)

df_models <- do.call(cbind, lists)

ar_models <- sort(colMeans(rankMatrix(df_models)))

nm_models <- nemenyiTest(df_models)
rownames(nm_models$diff.matrix) <- colnames(nm_models$diff.matrix)

# "cd" is the statistic reported by nemenyiTest; the rankings are sorted
# ascending and "names" repeats their labels in the same order.
lres <- list(
  cats = list(
    cd = nm_cats$statistic,
    average_ranking = ar_cats,
    names = names(ar_cats)
  ),
  models = list(
    cd = nm_models$statistic,
    average_ranking = ar_models,
    names = names(ar_models)
  )
)

write_yaml(lres, snakemake@output[[1]])

R yaml From line 1 of scripts/cd.R

import pandas as pd
import altair as alt
import numpy as np

# Box plots of the kappa (x) and error (y) values of all encoding pairs that
# were not picked by any pruning strategy and are not a vertex of the complete
# convex hull, drawn separately for every base model.
results = pd.read_csv(snakemake.input[0], index_col=0)

pieces = [pd.DataFrame()]
for model_name in results.model.unique():
    per_model = results.loc[results.model == model_name]
    picked_by_any = (
        per_model.ensemble_mvo
        | per_model.ensemble_best
        | per_model.ensemble_rand
        | per_model.ensemble_chull
        | per_model.ensemble_pfront
    )
    off_hull = per_model.chull_complete == -1
    leftover = per_model.loc[~picked_by_any & off_hull]
    # Long format: one row per value, tagged with its measure and its model.
    pieces.append(
        pd.DataFrame({"variable": leftover.x, "type": "kappa", "model": model_name})
    )
    pieces.append(
        pd.DataFrame({"variable": leftover.y, "type": "error", "model": model_name})
    )

long_values = pd.concat(pieces)

boxes = alt.Chart(long_values).mark_boxplot(color="grey", size=15)

type_color = alt.Color("type:N", title="Type", scale=alt.Scale(scheme="greys"))

chart = boxes.encode(
    x=alt.X("type:N", title=None, axis=None),
    y=alt.Y("variable:Q", title=None),
    color=type_color,
    column=alt.Column("model:N", title="Model", spacing=2),
).properties(width=50, height=100)

chart.save(snakemake.output[0])  # html

Python Pandas numpy altair From line 1 of plots/box_plot_manova.py

import pandas as pd
import altair as alt

from glob import glob

def _stack_csvs(paths):
    """Read every CSV in *paths* (first column as index) and stack them row-wise."""
    stacked = pd.DataFrame()
    for path in paths:
        stacked = pd.concat([stacked, pd.read_csv(path, index_col=0)])
    return stacked


def _mcc_axis(**axis_options):
    """Return the shared MCC y-encoding; *axis_options* extend the axis settings."""
    return alt.Y(
        "mcc:Q",
        scale=alt.Scale(domain=[0.0, 1.0]),
        axis=alt.Axis(values=[0.1, 0.3, 0.5, 0.7, 0.9], title=None, **axis_options)
    )


# MCC results of the ensembles and of the single encodings.
df_res = _stack_csvs(list(snakemake.input.ensemble_res))
df_res_single = _stack_csvs(list(snakemake.input.single_res))

# Both panels use the same box style (no outlier marks, no median tick).
box_style = {
    "size": 8,
    "color": "#000000",
    "opacity": 1.0,
    "outliers": {"size": 0},
    "median": False,
}
tilted_axis = alt.Axis(labelAngle=-35, grid=True)

# Left panel: ensembles, one row per base model and one column per category.
ensemble_boxes = alt.Chart(df_res).mark_boxplot(**box_style).encode(
    x=alt.X("meta_model:N", title=None, axis=tilted_axis),
    y=_mcc_axis(),
).properties(width=100, height=100)

c1 = ensemble_boxes.facet(
    row=alt.Row("model:N", title=None),
    column=alt.Column(
        "cat:N", title=None,
        sort=["pfront", "chull", "mvo", "best", "rand"]
    ),
    spacing=1
)

# Right panel: single encodings by rank; row labels are hidden and the
# y-axis is drawn on the right-hand side.
single_boxes = alt.Chart(df_res_single).mark_boxplot(**box_style).encode(
    x=alt.X("rank:N", title=None, axis=tilted_axis),
    y=_mcc_axis(orient="right"),
).properties(width=100, height=100)

c2 = single_boxes.facet(
    row=alt.Row("model:N", title=None, header=alt.Header(title=None, labels=False)),
    column=alt.Column(
        "cat:N", title=None,
        sort=["pfront", "chull", "mvo", "best", "rand", "single_best"]
    ),
    spacing=1
)

chart = alt.hconcat(c1, c2, spacing=0.7)
chart = chart.configure_header(labelFontSize=14).configure_axis(labelFontSize=12)

chart.save(snakemake.output[0], vegalite_version="5.1.0")  # html
chart.save(snakemake.output[1], vegalite_version="5.1.0")  # png

Python Pandas altair From line 1 of plots/box_plot.py

from more_itertools import chunked

import altair as alt
import pandas as pd

import re

# Plot the fitness per optimisation generation, parsed from the log files in
# snakemake.input, as a mean line with a confidence band; one facet per
# base model (column) and meta-model (row).

# The fold is the run of digits directly in front of the file extension.
# Slicing a single character (p[-5:-4]) returned only the last digit, so
# folds >= 10 were recorded with the wrong number.
fold_pattern = re.compile(r"(\d+)\.[^./]+$")

res2 = []
for p in list(snakemake.input):
    # Path components 4 and 5 carry the meta-model and the base model.
    path_parts = p.split("/")
    mmodel, model = path_parts[4], path_parts[5]

    fold_match = fold_pattern.search(p)
    if fold_match is None:
        raise ValueError(f"Cannot determine the fold number from path: {p}")
    fold = int(fold_match.group(1))

    with open(p) as f:
        # The log is read in chunks of six lines; the chunk number is used as
        # the generation and the third line of each chunk has the form
        # "Best Fitness: <float>, best metrics: {'mcc': <float>}".
        for gen, record in enumerate(chunked(f.readlines(), 6)):
            fitness, mcc = record[2].rstrip().split(",")
            fitness = float(fitness.replace("Best Fitness: ", ""))
            mcc = float(mcc.replace(" best metrics: {'mcc': ", "").replace("}", ""))
            res2.append([gen, fitness, mcc, fold, model, mmodel])

source = pd.DataFrame(res2, columns=["gen", "fitness", "mcc", "fold", "model", "mmodel"])

line = alt.Chart(source).mark_line(color="black").encode(
    x="gen:O",
    y="mean(fitness):Q"
)

band = alt.Chart(source).mark_errorband(extent="ci", color="black").encode(
    x=alt.X("gen:O", title=None, axis=alt.Axis(labelAngle=-35)),
    y=alt.Y("fitness:Q", title=None)
)

chart = (band + line).properties(
    width=100,
    height=100
).facet(
    column=alt.Column("model:N", title=None),
    row=alt.Row("mmodel:N", title=None),
    spacing=1
).configure_header(
    labelFontSize=14
).configure_axis(
    labelFontSize=12
)

chart.save(snakemake.output[0])  # html
chart.save(snakemake.output[1])  # png

Python Pandas altair more-itertools From line 1 of plots/gens_vs_perf.py

import pandas as pd
import altair as alt
import numpy as np

# Load the kappa-error points of all folds.
# (For local debugging: data/temp/amp_antibp2/kappa_error_res/plot_data.csv)
df_res = pd.read_csv(snakemake.input[0], index_col=0)

# The value range observed in fold 0 defines the plotted window ...
fold_0 = df_res.loc[df_res.fold == 0]
x_min, x_max = fold_0.x.min(), fold_0.x.max()
y_min, y_max = fold_0.y.min(), fold_0.y.max()

# ... and points of any fold that fall outside of it are discarded.
inside_window = df_res.x.between(x_min, x_max) & df_res.y.between(y_min, y_max)
df_res = df_res.loc[inside_window]

# adopted from https://realpython.com/python-rounding/#rounding-up
def round_up(n, decimals=0):
    """Round *n* up (towards +infinity) to *decimals* decimal places."""
    scale = 10 ** decimals
    scaled_ceiling = np.ceil(n * scale)
    return scaled_ceiling / scale

# adopted from https://realpython.com/python-rounding/#rounding-down
def round_down(n, decimals=0):
    """Round *n* down (towards -infinity) to *decimals* decimal places."""
    scale = 10 ** decimals
    scaled_floor = np.floor(n * scale)
    return scaled_floor / scale


# Kappa-error diagram: the left panel shows the points of fold 0 with the
# convex hull and Pareto frontier, the right panel a 2-D histogram of the
# points of all folds; both are faceted by base model (one row each).

# Widen the lower axis limits to one decimal place.
x_min = round_down(x_min, 1)
y_min = round_down(y_min, 1)

scatter = alt.Chart().mark_point(filled=True, opacity=1.0).encode(
    x=alt.X(
        "x:Q", title="kappa",
        scale=alt.Scale(domain=[x_min, x_max])
    ),
    y=alt.Y(
        "y:Q", title="average pair-wise error", axis=alt.Axis(grid=True),
        scale=alt.Scale(domain=[y_min, y_max])
    ),
    color=alt.Color(
        "cat:N", title="Pruning",
        scale=alt.Scale(
            domain=["all", "best", "chull", "mvo", "pfront", "rand"],
            range=["gray", "#fdae61", "#2c7bb6", "yellow", "#d7191c", "#abd9e9"]),
        legend=alt.Legend(orient="bottom", offset=12)
    ),
    # Points of category "all" are drawn smaller than the selected ones.
    size=alt.condition(
        alt.datum.cat == "all",
        alt.value(50),
        alt.value(100)
    ),
).properties(
    width=300,
    height=200
)

# Rows with chull == -1 are not part of the hull line; the remaining rows are
# connected in the order given by the "chull" column.
convex_hull = alt.Chart().mark_line(
    color="#2c7bb6",
    size=1.1
).encode(
    x=alt.X("x:Q", title=None),
    y=alt.Y("y:Q", title=None),
    order="chull:N",
).transform_filter(
    alt.datum.chull != -1
)

# Same scheme for the Pareto frontier, using the "pfront" column.
pareto_frontier = alt.Chart().mark_line(
    strokeDash=[5, 1],
    color="#d7191c",
    size=1.1
).encode(
    x="x:Q",
    y="y:Q",
    order="pfront:N"
).transform_filter(
    alt.datum.pfront != -1
)

# Dashed bound curve x = 1 - 1 / (1 - y), evaluated for y = 0.00 ... 0.50 and
# restricted to the visible window.
vals = np.arange(51) / 100
vals = [e for e in vals if e <= y_max]
df = pd.DataFrame({"x": [1 - (1 / (1 - i)) for i in vals], "y": vals})
df = df.loc[(df.x >= x_min) & (df.y >= y_min)]

bound_line = alt.Chart(df).mark_line(color="gray", strokeDash=[4, 4]).encode(
    x=alt.X("x:Q"),
    y="y:Q"
)

c1 = alt.layer(
    convex_hull,
    pareto_frontier,
    scatter,
    bound_line,
    data=df_res.loc[df_res.fold == 0]
).facet(
    # Row facet: use alt.Row (this was alt.Column), matching the c2 panel below.
    row=alt.Row("model", title=None),
    spacing=10
)

heatmap = alt.Chart().mark_rect().encode(
    x=alt.X(
        "x:Q",
        title=None,
        bin=alt.Bin(maxbins=40),
        axis=alt.Axis(values=[-1.0, -0.5, 0.0, 0.5, 1.0], format=".1f", grid=True),
        scale=alt.Scale(domain=[x_min, x_max])
    ),
    y=alt.Y(
        "y:Q",
        title=None,
        bin=alt.Bin(maxbins=40),
        # Only the grid lines are kept; the y-axis itself is drawn by c1.
        axis=alt.Axis(
            values=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
            format=".1f",
            grid=True, domain=False,
            ticks=False, labels=False
        ),
        scale=alt.Scale(domain=[y_min, y_max])
    ),
    color=alt.Color(
        "count(x):Q",
        title="Count",
        legend=alt.Legend(
            gradientLength=90,
            orient="bottom",
            offset=12
        ),
        scale=alt.Scale(scheme="greys")
    ),
    tooltip="count(x):Q"
).properties(
    width=300,
    height=200
)

c2 = alt.layer(
    heatmap,
    bound_line,
    data=df_res.reset_index()
).facet(
    row=alt.Row(
        "model:N",
        title=None,
        header=alt.Header(labels=False)
    ),
    spacing=10
)

resc = alt.hconcat(
    c1,
    c2,
    spacing=1
).resolve_scale(
    color="shared"
)

chart = resc.configure_header(
    labelFontSize=14
).configure_axis(
    labelFontSize=12
).configure_legend(
    gradientThickness=10,
    labelFontSize=13,
    columns=3
)

chart.save(snakemake.output[0])  # html

Python Pandas numpy altair From line 1 of plots/kappa_error.py

from xcd_plot import XCDChart

import pandas as pd
import altair as alt

import yaml

# Critical-difference data: the first input is the YAML file with the
# critical differences and average rankings.
with open(snakemake.input[0]) as f:
    cd_data = yaml.safe_load(f)

from glob import glob

# All remaining inputs are ensemble result tables; stack them row-wise.
df_res = pd.DataFrame()
for p in list(snakemake.input)[1:]:
    df_tmp = pd.read_csv(p, index_col=0)
    df_res = pd.concat([df_res, df_tmp])

# The "mvo" category is left out of the chart.
df_res = df_res.loc[df_res.cat != "mvo"]

from pprint import pprint

# The name-sorted copy of the model rankings that was computed here was never
# used (its write-back was commented out), so it has been removed.
xcd_chart = XCDChart(ensemble_data=df_res, cd_data=cd_data)
xcd_chart.save(snakemake.output[0])  # html
xcd_chart.save(snakemake.output[1])  # png

Python Pandas PyYAML altair From line 1 of plots/xcd.py

library(broom)

# Statistical tests on the kappa-error data.
#
# Outputs, in order of snakemake@output:
#   1  MANOVA of (x, y) by model          2  per-response ANOVA tables of the MANOVA
#   3  ANOVA of kappa by model            4  Tukey HSD for kappa
#   5  ANOVA of error by model            6  Tukey HSD for error
#   7  ANOVA of area by model             8  Tukey HSD for area
d <- read.csv(snakemake@input[[1]])
# d <- read.csv("data/temp/avp_amppred/kappa_error_res/plot_data.csv")

res.man <- manova(cbind(d$x, d$y) ~ model, data = d)
write.csv(tidy(res.man), snakemake@output[[1]])

# summary.aov yields one ANOVA table per response; both are written to one
# file, labelled by their response name.
manova_summary_aov <- summary.aov(res.man)
write.csv(rbind(
  data.frame(
    manova_summary_aov[[1]],
    response=names(manova_summary_aov[1])
  ),
  data.frame(
    manova_summary_aov[[2]],
    response=names(manova_summary_aov[2])
  )
), snakemake@output[[2]])

# Per model, keep only the points that are in none of the best / rand /
# chull / pfront ensembles and are not a vertex of the complete convex hull.
# The ensemble flags are compared as the text "False".
# NOTE(review): ensemble_mvo is not part of this filter -- confirm intended.
df_res <- do.call(rbind, lapply(unique(d[["model"]]), function(m) {
  d_tmp <- d[d["model"] == m, ]
  d_tmp <- d_tmp[
    (d_tmp$ensemble_best == "False") &
    (d_tmp$ensemble_rand == "False") &
    (d_tmp$ensemble_chull == "False") &
    (d_tmp$ensemble_pfront == "False") &
    (d_tmp$chull_complete == -1)
  , ]
  # d_tmp <- d_tmp[sample(nrow(d_tmp), 1000), ]
  data.frame(kappa = d_tmp[ ,"x"], error = d_tmp[ ,"y"], model = m)
}))

# Each model is fitted once and the fit is reused for the Tukey HSD test
# (previously the identical aov model was fitted a second time).
anova_kappa_aov <- aov(df_res$kappa ~ df_res$model)
write.csv(tidy(anova_kappa_aov), snakemake@output[[3]])

anova_kappa_tukey_hsd <- TukeyHSD(anova_kappa_aov)
write.csv(tidy(anova_kappa_tukey_hsd), snakemake@output[[4]])

anova_error_aov <- aov(df_res$error ~ df_res$model)
write.csv(tidy(anova_error_aov), snakemake@output[[5]])

anova_error_tukey_hsd <- TukeyHSD(anova_error_aov)
write.csv(tidy(anova_error_tukey_hsd), snakemake@output[[6]])

### areas

# Inputs 2-5 are read as the area tables
# (NOTE(review): the range is hard-coded to four files -- confirm).
df_res <- do.call(
  rbind,
  lapply(
    snakemake@input[2:5],
    # Sys.glob("data/temp/avp_amppred/areas/*/res.csv"),
    read.csv
  )
)

anova_area_aov <- aov(df_res$area ~ df_res$model)
write.csv(tidy(anova_area_aov), snakemake@output[[7]])

anova_area_tukey_hsd <- TukeyHSD(anova_area_aov)
write.csv(tidy(anova_area_tukey_hsd), snakemake@output[[8]])

R broom From line 1 of scripts/statistics.R

run:
    # Collect the label ("y") of every sample over all input tables and keep
    # only the samples whose index occurs in each of the tables.
    labels_by_idx = {}
    index_sets = []
    for path in input:
        frame = pd.read_csv(path, index_col=0)
        # Later tables overwrite the label of an index that was seen before.
        labels_by_idx.update(dict(frame["y"]))
        index_sets.append(set(frame.index))

    # indices shared by all tables
    shared = sorted(index_sets[0].intersection(*index_sets[1:]))

    df_res = pd.DataFrame(labels_by_idx.items(), columns=["idx", "y"])
    df_res.loc[df_res.idx.isin(shared)].to_csv(output[0])

SnakeMake From line 106 of main/Snakefile

run:
    # Draw len(FOLDS) stratified shuffle splits (80 % training). The held-out
    # part of each split is cut in two: the first half (rounded up) becomes
    # the validation set, the rest the test set. One column of sample ids is
    # written per fold.
    df_indices = pd.read_csv(input[0], index_col=0)
    sample_ids, labels = df_indices["idx"].values, df_indices.y.values

    splitter = StratifiedShuffleSplit(n_splits=len(FOLDS), train_size=.8, random_state=42)

    subsets = {"train": pd.DataFrame(), "val": pd.DataFrame(), "test": pd.DataFrame()}
    for train_pos, holdout_pos in splitter.split(sample_ids, labels):
        half = math.ceil(len(holdout_pos) / 2)
        positions = {
            "train": train_pos,
            "val": holdout_pos[:half],
            "test": holdout_pos[half:],
        }
        for name, pos in positions.items():
            # Reset the index so that the folds line up as columns.
            fold_ids = df_indices.iloc[pos, :]["idx"].reset_index(drop=True)
            subsets[name] = pd.concat([subsets[name], fold_ids], axis=1)

    fold_names = [f"fold_{i}" for i in FOLDS]
    for frame in subsets.values():
        frame.columns = fold_names

    subsets["train"].to_csv(output[0])
    subsets["val"].to_csv(output[1])
    subsets["test"].to_csv(output[2])

SnakeMake From line 128 of main/Snakefile

run:
    # Restrict one encoded dataset to the shared samples, min-max scale its
    # features (all columns but the last) and drop every column whose
    # standard deviation is zero.
    shared_ids = pd.read_csv(input[0], index_col=0)["idx"].values

    df = pd.read_csv(input[1], index_col=0).loc[shared_ids, ]
    features, labels = df.iloc[:, :-1].values, df["y"].values

    scaled = MinMaxScaler().fit_transform(features)
    vals = np.column_stack((scaled, labels))

    constant_cols = np.flatnonzero(pd.DataFrame(vals).std().values == 0)
    if len(constant_cols) != 0:
        # Report the datasets from which columns were removed.
        print(wildcards.csv_name)

    df_res = pd.DataFrame(
        np.delete(vals, constant_cols, 1),
        columns=np.delete(df.columns, constant_cols),
        index=df.index
    )
    df_res.to_csv(output[0])

SnakeMake From line 169 of main/Snakefile

run:
    # Train the base model on one encoded dataset (training plus validation
    # samples of the fold) and report the MCC on the fold's test samples.
    fold_col = f"fold_{wildcards.fold}"

    train_part = pd.read_csv(input[1], index_col=0)[fold_col]
    val_part = pd.read_csv(input[2], index_col=0)[fold_col]
    train_ids = pd.concat([train_part, val_part])

    test_ids = pd.read_csv(input[3], index_col=0)[fold_col]

    df = pd.read_csv(input[0], index_col=0)
    features = df.iloc[:, :-1]
    X_train, y_train = features.loc[train_ids, :].values, df.loc[train_ids, "y"].values
    X_test, y_test = features.loc[test_ids, :].values, df.loc[test_ids, "y"].values

    clf = MODEL[wildcards.model]
    try:
        clf.fit(X_train, y_train)
        mcc = matthews_corrcoef(y_test, clf.predict(X_test))
    except (np.linalg.LinAlgError, ValueError) as e:
        # A failed fit or prediction is reported and counted as MCC 0.
        print(e)
        mcc = 0.0

    result = pd.DataFrame({
        "mcc": [mcc],
        "fold": [wildcards.fold],
        "encoding": [wildcards.csv_name],
        "model": [wildcards.model]
    })
    result.to_csv(output[0])

SnakeMake From line 196 of main/Snakefile

run:
    # Keep the results of the three encodings with the highest mean MCC and
    # label them "Top_1" ... "Top_3".
    df_res = pd.DataFrame()
    for p in list(input):
        df_tmp = pd.read_csv(p, index_col=0)
        df_res = pd.concat([df_res, df_tmp])

    mean_mcc = df_res.groupby("encoding")["mcc"].mean().sort_values(ascending=False)
    top3 = mean_mcc.index[:3].to_list()
    rank_of = {enc: f"Top_{i}" for i, enc in enumerate(top3, start=1)}

    # Work on a copy of the filtered rows and derive the rank by mapping the
    # encoding. This replaces writing strings into an integer column that was
    # initialised with -1 (an upcast that newer pandas versions reject), done
    # on a slice of the original frame.
    df_res = df_res.loc[df_res.encoding.isin(top3)].copy()
    df_res["rank"] = df_res["encoding"].map(rank_of)
    df_res["cat"] = "single"

    df_res.to_csv(output[0])

SnakeMake From line 238 of main/Snakefile

run:
    # Fit one base classifier per encoded dataset on the training samples of
    # the fold and compute, for every pair of classifiers, the kappa of their
    # validation predictions ("x") and their mean validation error ("y").
    fold_col = f"fold_{wildcards.fold}"
    indcs_train = pd.read_csv(input[0], index_col=0)[fold_col]
    indcs_val = pd.read_csv(input[1], index_col=0)[fold_col]

    paths = list(input[2:])

    encoded_datasets = [pd.read_csv(p, index_col=0) for p in paths]

    # Features are all columns but the last one.
    X_train_list = [
        df.loc[indcs_train, :].iloc[:, :-1].values for df in encoded_datasets
    ]
    X_val_list = [
        df.loc[indcs_val, :].iloc[:, :-1].values for df in encoded_datasets
    ]

    # The labels are taken from the first dataset.
    y_train = encoded_datasets[0].loc[indcs_train, "y"].values
    y_val = encoded_datasets[0].loc[indcs_val, "y"].values

    # Only the first meta-model is used to fit the base classifiers.
    clf = MODEL[wildcards.model]
    eclf = META_MODEL[META_MODELS[0]]
    eclf.estimators = [(path, clf) for path in paths]
    eclf.fit(X_train_list, y_train)

    # Predict once per fitted base classifier instead of once per pair
    # (assumes predict() is deterministic and kappa() does not modify its
    # arguments -- TODO confirm).
    scored = []
    for (name, fitted), X_val in zip(eclf.estimators_, X_val_list):
        y_pred = fitted.predict(X_val)
        scored.append((name, y_pred, 1 - accuracy_score(y_pred, y_val)))

    res = [
        [kappa(pred_1, pred_2), np.mean([error_1, error_2]), e1, e2]
        for (e1, pred_1, error_1), (e2, pred_2, error_2)
        in itertools.combinations(scored, 2)
    ]

    df_res = pd.DataFrame(res, columns=["x", "y", "encoding_1", "encoding_2"])
    df_res["model"] = wildcards.model

    df_res.to_csv(output[0])

SnakeMake From line 291 of main/Snakefile

run:
    # Compute the convex hull of the kappa-error points, number its vertices
    # in the column "chull_complete" (-1 for all other points) and store the
    # area enclosed by the hull.
    df_points = pd.read_csv(input[0], index_col=0)

    hull = ConvexHull(df_points[["x", "y"]])

    df_points["chull_complete"] = -1
    df_points.iloc[hull.vertices, df_points.columns.get_loc("chull_complete")] = \
        range(hull.vertices.shape[0])

    df_points.to_csv(output[0])

    # For 2-D input, scipy reports the enclosed area as `volume`; `area` is
    # the perimeter of the hull there.
    pd.DataFrame({
        "model": [wildcards.model],
        "area": [hull.volume],
        "fold": [wildcards.fold]
    }).to_csv(output[1])

SnakeMake From line 343 of main/Snakefile

run:
    # Stack all input tables row-wise into a single file.
    tables = [pd.read_csv(path, index_col=0) for path in input]
    df_res = pd.concat([pd.DataFrame(), *tables])

    df_res.to_csv(output[0])

SnakeMake From line 367 of main/Snakefile

run:
    # Number the convex-hull vertices that belong to the first front returned
    # by pareto_n (towards the lower-left corner) by ascending x in the
    # column "chull"; all other points get -1.
    df_points = pd.read_csv(input[0], index_col=0)

    is_vertex = df_points.chull_complete != -1
    df_hull = df_points.loc[is_vertex, ["x", "y"]]

    # pareto_n is called on the negated values.
    P = pareto_n(-df_hull.values)
    front = df_hull.iloc[P[0], :].sort_values("x")
    indices = list(front.index)

    df_points["chull"] = -1
    chull_col = df_points.columns.get_loc("chull")
    # NOTE(review): index labels are used as row positions here, which
    # presumes the default 0..n-1 index.
    df_points.iloc[indices, chull_col] = range(len(indices))

    df_points.to_csv(output[0])

SnakeMake From line 407 of main/Snakefile

run:
    # Number the points of the first front returned by pareto_n by ascending
    # x in the column "pfront"; all other points get -1.
    df_points = pd.read_csv(input[0], index_col=0)

    # pareto_n is called on the negated values.
    P = pareto_n(-df_points[["x", "y"]].values)
    front = df_points.iloc[P[0], :].sort_values("x")
    indices = list(front.index)

    df_points["pfront"] = -1
    pfront_col = df_points.columns.get_loc("pfront")
    # NOTE(review): index labels are used as row positions here, which
    # presumes the default 0..n-1 index.
    df_points.iloc[indices, pfront_col] = range(len(indices))

    df_points.to_csv(output[0])

SnakeMake From line 427 of main/Snakefile

run:
    # "Best" pruning: build the ensemble from the encodings of the 15 pairs
    # with the lowest average pair-wise error, train it on the training plus
    # validation samples of the fold and report the MCC on the test samples.
    fold_col = f"fold_{wildcards.fold}"
    indcs_train = pd.concat([
        pd.read_csv(input[0], index_col=0)[fold_col],
        pd.read_csv(input[1], index_col=0)[fold_col]
    ])
    indcs_test = pd.read_csv(input[2], index_col=0)[fold_col]

    df_points = pd.read_csv(input[3], index_col=0)

    # y is average pairwise error
    best_pairs = df_points.sort_values("y").iloc[:15, :]

    # Sorted so that the estimator order does not depend on set iteration order.
    train_paths = sorted(set(
        best_pairs[["encoding_1", "encoding_2"]].values.flatten()
    ))

    # keep ensemble best encodings position for later usage
    # NOTE(review): index labels are used as row positions, which presumes
    # the default 0..n-1 index.
    indices = best_pairs.index
    df_points["ensemble_best"] = False
    df_points.iloc[indices, df_points.columns.get_loc("ensemble_best")] = True

    encoded_datasets = [pd.read_csv(p, index_col=0) for p in train_paths]

    # Features are all columns but the last one.
    X_train_list = [
        df.loc[indcs_train, :].iloc[:, :-1].values for df in encoded_datasets
    ]
    X_test_list = [
        df.loc[indcs_test, :].iloc[:, :-1].values for df in encoded_datasets
    ]

    y_train = encoded_datasets[0].loc[indcs_train, "y"].values
    y_test = encoded_datasets[0].loc[indcs_test, "y"].values

    clf = MODEL[wildcards.model]
    eclf = META_MODEL[wildcards.meta_model]
    eclf.estimators = [(path, clf) for path in train_paths]

    try:
        eclf.fit(X_train_list, y_train)
        y_pred = eclf.predict(X_test_list)
        mcc = matthews_corrcoef(y_test, y_pred)
    except (np.linalg.LinAlgError, ValueError) as e:
        # Without a fallback value `mcc` was undefined below. A failed run
        # counts as MCC 0, as in the single-encoding rule.
        print(e)
        mcc = 0.0

    pd.DataFrame({
        "mcc": [mcc],
        "fold": [wildcards.fold],
        "model": [wildcards.model],
        "meta_model": [wildcards.meta_model]
    }).to_csv(output[0])

    df_points.to_csv(output[1])

SnakeMake From line 448 of main/Snakefile

run:
    # Draw 15 distinct index values of the point table at random (unseeded
    # generator) and store them in the column "enc_index".
    df_points = pd.read_csv(input[0], index_col=0)

    rng = default_rng()
    idcs = rng.choice(df_points.index, size=15, replace=False)

    pd.DataFrame({"enc_index": idcs}).to_csv(output[0])

SnakeMake From line 511 of main/Snakefile

run:
    # "Random" pruning: build the ensemble from the encodings of the randomly
    # drawn pairs (input[4]), train it on the training plus validation
    # samples of the fold and report the MCC on the test samples.
    fold_col = f"fold_{wildcards.fold}"
    indcs_train = pd.concat([
        pd.read_csv(input[0], index_col=0)[fold_col],
        pd.read_csv(input[1], index_col=0)[fold_col]
    ])
    indcs_test = pd.read_csv(input[2], index_col=0)[fold_col]

    df_points = pd.read_csv(input[3], index_col=0)

    idcs = pd.read_csv(input[4], index_col=0)["enc_index"]

    # Sorted so that the estimator order does not depend on set iteration order.
    train_paths = sorted(set(
        df_points.iloc[idcs, :][["encoding_1", "encoding_2"]].values.flatten()
    ))

    # keep the positions of the randomly chosen pairs for later usage
    df_points["ensemble_rand"] = False
    df_points.iloc[idcs, df_points.columns.get_loc("ensemble_rand")] = True

    encoded_datasets = [pd.read_csv(p, index_col=0) for p in train_paths]

    # Features are all columns but the last one.
    X_train_list = [
        df.loc[indcs_train, :].iloc[:, :-1].values for df in encoded_datasets
    ]
    X_test_list = [
        df.loc[indcs_test, :].iloc[:, :-1].values for df in encoded_datasets
    ]

    y_train = encoded_datasets[0].loc[indcs_train, "y"].values
    y_test = encoded_datasets[0].loc[indcs_test, "y"].values

    clf = MODEL[wildcards.model]
    eclf = META_MODEL[wildcards.meta_model]
    eclf.estimators = [(path, clf) for path in train_paths]

    try:
        eclf.fit(X_train_list, y_train)
        y_pred = eclf.predict(X_test_list)
        mcc = matthews_corrcoef(y_test, y_pred)
    except (np.linalg.LinAlgError, ValueError) as e:
        # Without a fallback value `mcc` was undefined below. A failed run
        # counts as MCC 0, as in the single-encoding rule.
        print(e)
        mcc = 0.0

    pd.DataFrame({
        "mcc": [mcc],
        "fold": [wildcards.fold],
        "model": [wildcards.model],
        "meta_model": [wildcards.meta_model]
    }).to_csv(output[0])

    df_points.to_csv(output[1])

SnakeMake From line 528 of main/Snakefile

run:
    # Convex-hull pruning: build the ensemble from the encodings of all pairs
    # with chull != -1, train it on the training plus validation samples of
    # the fold and report the MCC on the test samples.
    fold_col = f"fold_{wildcards.fold}"
    indcs_train = pd.concat([
        pd.read_csv(input[0], index_col=0)[fold_col],
        pd.read_csv(input[1], index_col=0)[fold_col]
    ])
    indcs_test = pd.read_csv(input[2], index_col=0)[fold_col]

    df_points = pd.read_csv(input[3], index_col=0)

    hull_pairs = df_points.loc[df_points.chull != -1]

    # Sorted so that the estimator order does not depend on set iteration order.
    train_paths = sorted(set(
        hull_pairs[["encoding_1", "encoding_2"]].values.flatten()
    ))

    # keep the positions of the selected pairs for later usage
    # NOTE(review): index labels are used as row positions, which presumes
    # the default 0..n-1 index.
    indices = hull_pairs.index
    df_points["ensemble_chull"] = False
    df_points.iloc[indices, df_points.columns.get_loc("ensemble_chull")] = True

    encoded_datasets = [pd.read_csv(p, index_col=0) for p in train_paths]

    # Features are all columns but the last one.
    X_train_list = [
        df.loc[indcs_train, :].iloc[:, :-1].values for df in encoded_datasets
    ]
    X_test_list = [
        df.loc[indcs_test, :].iloc[:, :-1].values for df in encoded_datasets
    ]

    y_train = encoded_datasets[0].loc[indcs_train, "y"].values
    y_test = encoded_datasets[0].loc[indcs_test, "y"].values

    clf = MODEL[wildcards.model]
    eclf = META_MODEL[wildcards.meta_model]
    eclf.estimators = [(path, clf) for path in train_paths]

    try:
        eclf.fit(X_train_list, y_train)
        y_pred = eclf.predict(X_test_list)
        mcc = matthews_corrcoef(y_test, y_pred)
    except (np.linalg.LinAlgError, ValueError) as e:
        # Without a fallback value `mcc` was undefined below. A failed run
        # counts as MCC 0, as in the single-encoding rule.
        print(e)
        mcc = 0.0

    pd.DataFrame({
        "mcc": [mcc],
        "fold": [wildcards.fold],
        "model": [wildcards.model],
        "meta_model": [wildcards.meta_model]
    }).to_csv(output[0])

    df_points.to_csv(output[1])

SnakeMake From line 592 of main/Snakefile

run:
    # Pareto-frontier pruning: build the ensemble from the encodings of all
    # pairs with pfront != -1, train it on the training plus validation
    # samples of the fold and report the MCC on the test samples.
    fold_col = f"fold_{wildcards.fold}"
    indcs_train = pd.concat([
        pd.read_csv(input[0], index_col=0)[fold_col],
        pd.read_csv(input[1], index_col=0)[fold_col]
    ])
    indcs_test = pd.read_csv(input[2], index_col=0)[fold_col]

    df_points = pd.read_csv(input[3], index_col=0)

    front_pairs = df_points.loc[df_points.pfront != -1]

    # Sorted so that the estimator order does not depend on set iteration order.
    train_paths = sorted(set(
        front_pairs[["encoding_1", "encoding_2"]].values.flatten()
    ))

    # keep the positions of the selected pairs for later usage
    # NOTE(review): index labels are used as row positions, which presumes
    # the default 0..n-1 index.
    indices = front_pairs.index
    df_points["ensemble_pfront"] = False
    df_points.iloc[indices, df_points.columns.get_loc("ensemble_pfront")] = True

    encoded_datasets = [pd.read_csv(p, index_col=0) for p in train_paths]

    # Features are all columns but the last one.
    X_train_list = [
        df.loc[indcs_train, :].iloc[:, :-1].values for df in encoded_datasets
    ]
    X_test_list = [
        df.loc[indcs_test, :].iloc[:, :-1].values for df in encoded_datasets
    ]

    y_train = encoded_datasets[0].loc[indcs_train, "y"].values
    y_test = encoded_datasets[0].loc[indcs_test, "y"].values

    clf = MODEL[wildcards.model]
    eclf = META_MODEL[wildcards.meta_model]
    eclf.estimators = [(path, clf) for path in train_paths]

    try:
        eclf.fit(X_train_list, y_train)
        y_pred = eclf.predict(X_test_list)
        mcc = matthews_corrcoef(y_test, y_pred)
    except (np.linalg.LinAlgError, ValueError) as e:
        # Without a fallback value `mcc` was undefined below. A failed run
        # counts as MCC 0, as in the single-encoding rule.
        print(e)
        mcc = 0.0

    pd.DataFrame({
        "mcc": [mcc],
        "fold": [wildcards.fold],
        "model": [wildcards.model],
        "meta_model": [wildcards.meta_model]
    }).to_csv(output[0])

    df_points.to_csv(output[1])

SnakeMake From line 657 of main/Snakefile

run:
    # MVO pruning: let BinaryMVO select a subset of all encodings, train the
    # ensemble of the selected encodings on the training plus validation
    # samples of the fold and report the MCC on the test samples.
    fold_col = f"fold_{wildcards.fold}"

    # use complete for MVO inner cv
    indcs_train = pd.concat([
        pd.read_csv(input[0], index_col=0)[fold_col],
        pd.read_csv(input[1], index_col=0)[fold_col]
    ])

    # use for testing after optimization
    indcs_test = pd.read_csv(input[2], index_col=0)[fold_col]

    df_points = pd.read_csv(input[3], index_col=0)

    # All encodings that occur in any pair; sorted so that the positions of
    # the solution vector do not depend on set iteration order.
    train_paths = sorted(set(
        df_points[["encoding_1", "encoding_2"]].values.flatten()
    ))

    n_universes = 32
    max_generations = 15

    p_0 = 6 / len(train_paths)
    mvo = BinaryMVO(
        n_universes=n_universes,
        d=len(train_paths),
        f=ff.train_ensemble,
        f_args={
            "paths_to_encoded_datasets": train_paths,
            "train_index": indcs_train,
            "base_clf": MODEL[wildcards.model],
            "meta_clf": META_MODEL[wildcards.meta_model]
        },
        p=[p_0, 1 - p_0],
        funker_name=None,
        new_random_state_each_generation=False,
        n_jobs=n_universes,
        log_path=os.path.dirname(output[2]) + "/",
        log_file_name=os.path.basename(output[2])
    )

    best_solution, _ = mvo.run(0, max_iterations=max_generations, parallel=True)

    # Encodings at the non-zero positions of the best solution.
    train_paths_best = np.array(train_paths)[np.nonzero(best_solution)[0]]

    # keep the positions of the pairs made of selected encodings for later usage
    # NOTE(review): index labels are used as row positions, which presumes
    # the default 0..n-1 index.
    indices = df_points.loc[
        df_points.encoding_1.isin(train_paths_best) &
        df_points.encoding_2.isin(train_paths_best)
    ].index
    df_points["ensemble_mvo"] = False
    df_points.iloc[indices, df_points.columns.get_loc("ensemble_mvo")] = True

    encoded_datasets = [pd.read_csv(p, index_col=0) for p in train_paths_best]

    # Features are all columns but the last one.
    X_train_list = [
        df.loc[indcs_train, :].iloc[:, :-1].values for df in encoded_datasets
    ]
    X_test_list = [
        df.loc[indcs_test, :].iloc[:, :-1].values for df in encoded_datasets
    ]

    y_train = encoded_datasets[0].loc[indcs_train, "y"].values
    y_test = encoded_datasets[0].loc[indcs_test, "y"].values

    clf = MODEL[wildcards.model]
    eclf = META_MODEL[wildcards.meta_model]
    # Name each estimator after the selected encoding it is trained on. The
    # names used to come from the first len(train_paths_best) entries of
    # train_paths, i.e. from unrelated encodings.
    eclf.estimators = [(str(path), clf) for path in train_paths_best]

    try:
        eclf.fit(X_train_list, y_train)
        y_pred = eclf.predict(X_test_list)
        mcc = matthews_corrcoef(y_test, y_pred)
    except (np.linalg.LinAlgError, ValueError) as e:
        # Without a fallback value `mcc` was undefined below. A failed run
        # counts as MCC 0, as in the single-encoding rule.
        print(e)
        mcc = 0.0

    pd.DataFrame({
        "mcc": [mcc],
        "fold": [wildcards.fold],
        "model": [wildcards.model],
        "meta_model": [wildcards.meta_model]
    }).to_csv(output[0])

    df_points.to_csv(output[1])

SnakeMake From line 725 of main/Snakefile

run:
    # Pass all input paths of this rule, together with the first output path,
    # to combine_point_data (helper not shown in this excerpt).
    point_files = list(input)
    combined_path = output[0]
    combine_point_data(point_files, combined_path)

SnakeMake From line 833 of main/Snakefile

run:
    # Forward every input path and the first output path to
    # combine_point_data (helper not shown in this excerpt).
    source_paths = list(input)
    target_path = output[0]
    combine_point_data(source_paths, target_path)

SnakeMake From line 844 of main/Snakefile

run:
    # Gather the result tables of all named inputs into one CSV. Every row is
    # tagged with the name of the input it came from (column "cat").
    frames = []
    for cat, path_obj in input.items():
        if isinstance(path_obj, snakemake.io.Namedlist):
            # Named input that holds several files: read each one as is.
            for csv_path in path_obj:
                df_tmp = pd.read_csv(csv_path, index_col=0)
                df_tmp["cat"] = cat
                frames.append(df_tmp)
        else:
            # Named input that holds a single file: it gets the meta model of
            # the current wildcards and loses its "encoding" column.
            df_tmp = pd.read_csv(path_obj, index_col=0)
            df_tmp["meta_model"] = wildcards.meta_model
            df_tmp["cat"] = cat
            frames.append(df_tmp.drop("encoding", axis=1))

    # Concatenate once instead of re-copying the accumulated frame per file;
    # an empty frame is still written when there is nothing to read.
    df_res = pd.concat(frames) if frames else pd.DataFrame()
    df_res.to_csv(output[0])

SnakeMake From line 871 of main/Snakefile

run:
    # Stack all input CSVs (first column used as index) and write them out
    # with a fresh 0..n-1 index.
    frames = [pd.read_csv(p, index_col=0) for p in input]

    # One concat for all files; pd.concat rejects an empty list, so fall back
    # to an empty frame to keep producing an output file in that case.
    df_res = pd.concat(frames) if frames else pd.DataFrame()

    df_res.reset_index(drop=True).to_csv(output[0])

SnakeMake From line 895 of main/Snakefile

script:
    "scripts/cd.R"

SnakeMake From line 911 of main/Snakefile

run:
    # Collect the point data for the kappa-error plots: per model and fold,
    # keep every row that belongs to an ensemble or has chull_complete != -1,
    # plus a random sample of the remaining rows, then label each row with
    # the ensemble category it belongs to.
    # we use only one ensemble method here, because it does not influence the kappa-error values
    max_background = 1000  # upper bound of randomly sampled remaining rows
    input_paths = list(input)
    frames = []
    for m in MODELS:
        for f in FOLDS:
            path = [p for p in input_paths if f"/{m}/" in p and f"/{f}.csv" in p][0]
            df_tmp = pd.read_csv(path, index_col=0)
            filter_vals = [
                df_tmp.ensemble_best,
                df_tmp.ensemble_rand,
                df_tmp.ensemble_chull,
                df_tmp.ensemble_pfront
            ]
            # The MVO flag is optional; not every input file carries it.
            if "ensemble_mvo" in df_tmp.columns:
                filter_vals.append(df_tmp.ensemble_mvo)
            # True for rows that are part of at least one ensemble.
            filter_ = reduce(lambda v1, v2: v1 | v2, filter_vals)

            background = df_tmp.loc[~filter_ & (df_tmp.chull_complete == -1)]
            # Never request more rows than exist: DataFrame.sample raises
            # when n exceeds the population (without replacement).
            df_tmp1 = background.sample(min(max_background, len(background)))
            df_tmp2 = df_tmp.loc[filter_ | (df_tmp.chull_complete != -1)]

            df_tmp = pd.concat([df_tmp1, df_tmp2])
            df_tmp["model"], df_tmp["fold"] = m, f
            frames.append(df_tmp)

    df_res = pd.concat(frames)

    # Files without the MVO flag leave NaN (or no column at all) behind;
    # treat both as "not part of the MVO ensemble".
    if "ensemble_mvo" not in df_res.columns:
        df_res["ensemble_mvo"] = False
    df_res.loc[df_res.ensemble_mvo.isna(), "ensemble_mvo"] = False

    # First matching flag wins, in this fixed priority order.
    df_res["cat"] = df_res.apply(
        lambda row:
            "mvo" if row.ensemble_mvo else
            "chull" if row.ensemble_chull else
            "pfront" if row.ensemble_pfront else
            "best" if row.ensemble_best else
            "rand" if row.ensemble_rand else
            "all"
        , axis=1)

    df_res.to_csv(output[0])

SnakeMake From line 921 of main/Snakefile

script:
    "scripts/plots/kappa_error.py"

SnakeMake From line 969 of main/Snakefile

script:
    "scripts/plots/gens_vs_perf.py"

SnakeMake From line 980 of main/Snakefile

script:
    "scripts/plots/box_plot.py"

SnakeMake From line 994 of main/Snakefile

script:
    "scripts/plots/xcd.py"

SnakeMake From line 1006 of main/Snakefile

script:
    "scripts/plots/box_plot_manova.py"

SnakeMake From line 1014 of main/Snakefile

script:
    "scripts/statistics.R"

SnakeMake From line 1031 of main/Snakefile

run:
    # Render each input CSV as an HTML table under its own <h4> heading and
    # write them, in sorted path order, into the single output file.

    def strip_frame_prefix(value):
        # Remove the "df_res$" prefix from entries of the "term" column.
        return value.replace("df_res$", "")

    with open(output[0], "w") as html_file:
        for csv_path in sorted(input):
            table = pd.read_csv(
                csv_path,
                index_col=0,
                converters={"term": strip_frame_prefix},
            )
            table = table.fillna("-")
            # File name without directory and without its last four
            # characters (presumably the ".csv" suffix).
            experiment = csv_path.rsplit("/", 1)[-1][:-4]
            table["experiment"] = experiment
            heading = f"<h4>{experiment}</h4>"
            html_file.write(f"{heading}\n{table.to_html(col_space='70px')}\n")

SnakeMake From line 1046 of main/Snakefile

run:
    # Build a table that lists, per encoding, the distinct parameter values
    # found in the input file names ("<encoding>_<param>_<param>....csv").

    def split_name(name):
        """Split a file stem into [encoding, param, param, ...]."""
        # "electrostatic_hull" (18 chars) and "dist_freq" (9 chars) contain
        # an underscore themselves, so they are cut off by length instead of
        # being split.
        if "hull" in name:
            # NOTE(review): name[18:] keeps the "_" that follows the prefix,
            # so the first parameter of these names is an empty string; the
            # dist_freq branch skips its separator. Kept as is -- confirm
            # whether the offset should be 19.
            return [name[:18]] + name[18:].split("_")
        if "dist_freq" in name:
            return [name[:9]] + name[10:].split("_")
        return name.split("_")


    def calc(vals):
        """Join the distinct values of one parameter column with '; '."""
        # dict.fromkeys de-duplicates like set() but keeps first-seen order,
        # so the output no longer depends on string hash randomisation.
        return "; ".join([str(v) for v in dict.fromkeys(vals)])


    stems = sorted({p.split("/")[-1].replace(".csv", "") for p in input})

    # One row per file stem; shorter names leave the trailing columns NaN.
    df = pd.DataFrame([dict(enumerate(split_name(stem))) for stem in stems])
    df.columns = ["param_" + str(i) for i in range(5)]

    df = df.groupby("param_0").apply(lambda grp: pd.DataFrame({
        "params_1": [calc(grp["param_1"].values)],
        "params_2": [calc(grp["param_2"].values)],
        "params_3": [calc(grp["param_3"].values)],
        "params_4": [calc(grp["param_4"].values)],
    })).reset_index(drop=False)

    # reset_index also exposes the inner level of the per-group frames.
    df = df.drop(["level_1"], axis=1)

    cols = list(df.columns)
    cols[0] = "encoding"
    df.columns = cols

    # Cells holding nothing but a stringified NaN become empty.
    df = df.replace("nan", "")

    df.to_csv(output[0], sep=",", index_label=False)

SnakeMake From line 1065 of main/Snakefile

shell:
    "tar czf {output[0]} {input}"

SnakeMake From line 1111 of main/Snakefile

run:
    # Write two HTML tables (with and without the "rf" model) showing, per
    # dataset and category, the best mean MCC (± std). The best category of
    # each dataset is annotated with the significance of a paired one-sided
    # t-test against the best "single" entry of that dataset.
    from scipy.stats import ttest_rel

    def significance_label(pval):
        """Translate a p-value into the marker shown in the table."""
        # 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
        # The thresholds follow the legend above; a NaN p-value fails every
        # comparison and is reported as not significant.
        if pval < 0.001:
            return "***"
        if pval < 0.01:
            return "**"
        if pval < 0.05:
            return "*"
        if pval < 0.1:
            return "."
        return "ns"

    def get_table(df_res):
        """Return the summary table for df_res as an HTML string.

        df_res must provide the columns "dataset", "model", "cat",
        "meta_model" and "mcc", and every dataset needs rows with
        cat == "single".
        """
        df_stats = df_res \
            .groupby(["dataset", "model", "cat", "meta_model"])["mcc"] \
            .describe().reset_index() \
            .loc[:, ['dataset', 'model', 'cat', 'meta_model', 'mean', 'std']]

        # Per dataset and category keep the model/meta-model combination
        # with the highest mean MCC.
        df_final = df_stats \
            .groupby(["dataset", "cat"]) \
            .apply(lambda df: df.sort_values("mean", ascending=False).iloc[0, :]) \
            .reset_index(drop=True)

        # Label-based access: integer keys on a label-indexed Series are
        # deprecated as positional lookups.
        df_final["anno"] = df_final[["mean", "std"]].apply(
            lambda row: f"{np.round(row['mean'], 2)} (±{np.round(row['std'], 2)})",
            axis=1
        )

        df_out = df_final.pivot(index="dataset", columns="cat", values="anno")

        for ds in df_final.dataset.unique():

            best_ens_cat, best_ens_mm, best_ens_m = df_final\
                .loc[df_final.dataset == ds]\
                .sort_values("mean", ascending=False)[["cat", "meta_model", "model"]]\
                .iloc[0]

            single_best_mm, single_best_m = df_final\
                .loc[(df_final.dataset == ds) & (df_final.cat == "single")]\
                .sort_values("mean",ascending=False)[["meta_model", "model"]] \
                .iloc[0]

            a1 = df_res.loc[
                (df_res.dataset == ds) &
                (df_res.cat == best_ens_cat) &
                (df_res.meta_model == best_ens_mm) &
                (df_res.model == best_ens_m)
            , "mcc"].values

            a2 = df_res.loc[
                (df_res.dataset == ds) &
                (df_res.cat == "single") &
                (df_res.meta_model == single_best_mm) &
                (df_res.model == single_best_m)
            , "mcc"].values

            # ttest_rel needs equally long samples; trim both sides to the
            # shorter one (e.g. in case MVO is best method).
            n_pairs = min(len(a1), len(a2))
            _, pval = ttest_rel(a1[:n_pairs], a2[:n_pairs], alternative="greater")

            sig = significance_label(pval)

            df_out.loc[ds, best_ens_cat] = \
                df_out.loc[ds, best_ens_cat].replace(" (",f"<sup>{sig}</sup> (")

        # Drop the axis names so they do not show up as extra header cells.
        df_out.columns.name = None
        df_out.index.name = None

        # escape=False keeps the <sup> markup intact.
        return df_out.to_html(escape=False)

    frames = []

    for p in input.ensembles_res:
        df_tmp = pd.read_csv(p,index_col=0)
        # The dataset name is the third component of the input path.
        df_tmp["dataset"] = p.split("/")[2]
        frames.append(df_tmp)

    for p in input.single_encodings_res:
        df_tmp = pd.read_csv(p,index_col=0)
        df_tmp["dataset"] = p.split("/")[2]
        # Only the top-ranked single encoding takes part; its rank label
        # stands in for the meta model.
        df_tmp = df_tmp.loc[df_tmp["rank"] == "Top_1", :]
        df_tmp = df_tmp.drop(["encoding"],axis=1)
        df_tmp = df_tmp.rename(columns={"rank": "meta_model"})
        frames.append(df_tmp)

    df_res = pd.concat(frames)

    t1 = get_table(df_res)

    df_res = df_res.loc[df_res.model != "rf"]
    t2 = get_table(df_res)

    with open(output[0], "w") as f:
        h1 = "<h3>With RF</h3>"
        h2 = "<h3>Without RF</h3>"
        f.write(f"{h1}\n{t1}\n{h2}\n{t2}\n")

SnakeMake scipy From line 1124 of main/Snakefile

run:
    # Summarise the "area" column of every input CSV as "mean (± std)" and
    # write a dataset x model HTML table.
    records = []
    for p in input:
        df_tmp = pd.read_csv(p, index_col=0)
        area_stats = df_tmp.area.describe()
        path_parts = p.split("/")
        records.append({
            "mean": area_stats["mean"],
            "std": area_stats["std"],
            # Dataset and model are taken from fixed positions of the path.
            "dataset": path_parts[2],
            "model": path_parts[-2],
        })

    df_stats = pd.DataFrame(records, columns=["mean", "std", "dataset", "model"])

    # Label-based access: integer keys on a label-indexed Series are
    # deprecated as positional lookups.
    df_stats["anno"] = df_stats[["mean", "std"]].apply(
        lambda row: f"{np.round(row['mean'], 2)} (±{np.round(row['std'], 3)})",
        axis=1
    )

    df_out = df_stats.pivot(index="dataset", columns="model", values="anno")
    # Drop the axis names so they do not show up as extra header cells.
    df_out.columns.name = None
    df_out.index.name = None

    with open(output[0], "w") as f:
        f.write(f"{df_out.to_html()}\n")