Snakemake Workflow for Quantum Definition of Molecular Structure Analysis

public public 1yr ago 0 bookmarks
Loading...

This is the Snakemake workflow for reproducing the data analysis of our paper on a quantum definition of molecular structure .

In order to execute the workflow, you need to have

Code Snippets

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import sys

sample_name = sys.argv[1]
medoid_index_name = sys.argv[2]
clusterinterval = int(sys.argv[3])
outfile = sys.argv[4]

bandwidth = 0.1  # KDE bandwidth
d = 0.1   # spacing between voxels
xmin = ymin = -2
xmax = ymax = 2
zmin = -1.5
zmax = 1.5
Nx = int((xmax-xmin)/d + 1)
Ny = int((ymax-ymin)/d + 1)
Nz = int((zmax-zmin)/d + 1)

coords = np.load(sample_name)
nselection = coords.shape[0]   # select all

Nel = 2
eltot_xyz = np.empty((Nel*nselection, 3))    # KDE estimate for ANY electron positions (combining e1 and e2)

for i in range(nselection):
    eltot_xyz[i, :] = coords[i, :3, 3]
    eltot_xyz[nselection+i, :] = coords[i, :3, 4]

xgrid = np.linspace(xmin, xmax, Nx)
ygrid = np.linspace(ymin, ymax, Ny)
zgrid = np.linspace(zmin, zmax, Nz)
X,Y,Z = np.meshgrid(xgrid, ygrid, zgrid, indexing='ij')
XYZ_values = np.vstack((X.flatten(), Y.flatten(), Z.flatten())).transpose()
XYZ_values_parts = []
Nparts = Nx
size_part = Nx*Ny*Nz//Nparts
for i in range(Nparts):
    XYZ_values_parts.append(XYZ_values[i*size_part:i*size_part+size_part, :])

print("Doing KDE fit ...")
kde_electrons = KernelDensity(bandwidth=bandwidth)
kde_electrons.fit(eltot_xyz)
density_electrons_parts = []
print("Evaluating KDE on the grid ...")
for i in tqdm(range(Nparts)):
    log_density_electrons = kde_electrons.score_samples(XYZ_values_parts[i])
    density_electrons_parts.append(Nel*np.exp(log_density_electrons))    # KDE density is normalized to 1, so have to multiply with number of electrons
density_electrons = np.hstack(density_electrons_parts)

medoid_index = np.load(medoid_index_name)[0]
medoid_index = clusterinterval*(medoid_index+1)   # to index ALL samples
coords_medoid = coords[medoid_index,:,:]

### Write cube file
natoms = 3
Z_H = 1    # nuclear charge of hydrogen
print("Writing cube file ...")
with open(outfile, "w") as f:
    f.write("KDE electron density extracted from random samples drawn from the non-BO joint probability density\n\n")
    f.write("{} {} {} {}\n".format(natoms, xmin, ymin, zmin))
    f.write("{} {} {} {}\n".format(Nx, d, 0.0, 0.0))
    f.write("{} {} {} {}\n".format(Ny, 0.0, d, 0.0))
    f.write("{} {} {} {}\n".format(Nz, 0.0, 0.0, d))
    # write medoid positions:
    f.write("{} {} {} {} {}\n".format(Z_H, 0.0, coords_medoid[0, 0], coords_medoid[1, 0], coords_medoid[2, 0]))
    f.write("{} {} {} {} {}\n".format(Z_H, 0.0, coords_medoid[0, 1], coords_medoid[1, 1], coords_medoid[2, 1]))
    f.write("{} {} {} {} {}\n".format(Z_H, 0.0, coords_medoid[0, 2], coords_medoid[1, 2], coords_medoid[2, 2]))
    for i in tqdm(range(Nx*Ny*Nz)):
        f.write("{}\n".format(density_electrons[i]))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import matplotlib
import numpy as np
from mpl_toolkits.axes_grid1 import make_axes_locatable
from matplotlib.gridspec import GridSpec
import sys

samples_aligned_name = sys.argv[1]
medoid_index_name = sys.argv[2]
nselection_clustering = int(sys.argv[3])
nselection_samples = int(sys.argv[4])
outfile_name = sys.argv[5]

bandwidth = 0.05
xmin = ymin = -1.5
xmax = ymax = 1.5
Nx = Ny = 50

mycmap_blue = LinearSegmentedColormap.from_list('mycmap_blue', [(0.12156862745098039, 0.4666666666666667, 0.7058823529411765,0), (0.12156862745098039, 0.4666666666666667, 0.7058823529411765,1)])    # interpolate alpha value from 0 to 1 for tab:blue

coords_all = np.load(samples_aligned_name)
nsteps = coords_all.shape[0]

interval_clustering = nsteps//nselection_clustering
D1_xy = np.empty((3*nselection_clustering, 2))          # only x,y since z coordinate is always very close to zero and therefore not interesting.

for i in range(nselection_clustering):
    index = interval_clustering*(i+1) - 1
    D1_xy[i, :] = coords_all[index, :2, 0]
    D1_xy[nselection_clustering+i, :] = coords_all[index, :2, 1]
    D1_xy[2*nselection_clustering+i, :] = coords_all[index, :2, 2]

xgrid = np.linspace(xmin, xmax, Nx)
ygrid = np.linspace(ymin, ymax, Ny)
X,Y = np.meshgrid(xgrid, ygrid)
XY_values = np.vstack((X.flatten(), Y.flatten())).transpose()

kde_D1 = KernelDensity(bandwidth=bandwidth)
kde_D1.fit(D1_xy)
log_density_D1 = kde_D1.score_samples(XY_values)
density_D1 = np.exp(log_density_D1)
Z_D1 = 3*np.flip(density_D1.reshape((Nx, Ny)), axis=0)     # flip reverses the rows. This is important for plotting with imshow. times 3 such that it is normalized to number of deuterons.

fig = plt.figure(figsize=(4, 2))
gs = GridSpec(1, 2, width_ratios=[4,5], height_ratios=[1])    # choose right axis 20% larger such that colorbar also fits in (otherwise the plot will shrink to make space for the colorbar).
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

# Left plot: medoid structure and six samples
interval_samples = nsteps//nselection_samples
Nnuc = 3    # we have three deuterons
colors_samples = ['tab:red', 'tab:green', 'tab:blue', 'tab:olive', 'tab:purple', 'tab:orange']
medoid_index = np.load(medoid_index_name)[0]
medoid_coords = coords_all[interval_clustering*(medoid_index+1) - 1]
for i in range(nselection_samples):
    index = interval_samples*(i+1) - 1
    for j in range(Nnuc):    # plot deuterons
        ax1.scatter(coords_all[index, 0,j], coords_all[index, 1,j], s=10, c=colors_samples[i])
for j in range(Nnuc):    # plot deuterons
    ax1.scatter(medoid_coords[0,j], medoid_coords[1,j], s=10, c='k')

ax1.set_xlim([xmin,xmax])
ax1.set_ylim([ymin,ymax])
ax1.set_xlabel("x / Bohrs")
ax1.set_ylabel("y / Bohrs")
ax1.set(adjustable='box', aspect='equal')

# Right plot: Heatplot of KDE
fig_D1 = ax2.imshow(Z_D1, extent = (xmin, xmax, ymin, ymax), cmap = mycmap_blue, interpolation = 'bicubic')
ax2.set_xlabel("x / Bohrs")
ax2.set_ylabel("y / Bohrs")
divider = make_axes_locatable(ax2)
cax = divider.append_axes("right", size="10%", pad="15%")
plt.colorbar(fig_D1, cax = cax)
plt.tight_layout()
plt.savefig(outfile_name, dpi=300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
import sys
import numpy as np
import matplotlib.pyplot as plt

rhofile = sys.argv[1]
nsteps = int(sys.argv[2])
outfile = sys.argv[3]

rhovalues = np.loadtxt(rhofile)

median = np.median(rhovalues)

plt.plot([1,nsteps], [median, median], color='tab:orange') 
plt.plot(rhovalues, color='tab:blue')                     
plt.xlabel("Step")
plt.ylabel(r"$\rho$ / a.u.")
plt.xlim([1,nsteps])
plt.ylim([1e-10, 1e-3])
plt.yscale('log')
plt.savefig(outfile, dpi=300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import numpy as np
import matplotlib.pyplot as plt
from decimal import Decimal
import matplotlib
import sys

outfolder = sys.argv[1]

matplotlib.rc('font', size=8)

def mantissa_exp(x):
    (sign, digits, exponent) = Decimal(x).as_tuple()  # this is the exponent required to move the decimal point from after the last digit to the correct position
    exponent = len(digits) + exponent - 1
    mantissa = round(x/(10**exponent), ndigits=1)
    return mantissa, exponent

def nicestring(x):
    mantissa, exponent = mantissa_exp(x)
    mantissa = str(mantissa)
    exponent = str(exponent)
    return r"$\rho = "+mantissa+r"\times"+"10^{"+exponent+"}$"

saved_rho = np.loadtxt(f"{outfolder}/saved_rho_selected")

coords = []
for i in range(1,6+1):
    coords.append(np.loadtxt(f"{outfolder}/sample{i}_planebasis"))

fig, axes = plt.subplots(2, 3, figsize=(4, 3), sharex=True, sharey=True)    # create 2 rows with 3 plots each

size_deuteron = 80
size_electron = 20
value_range = [-2, 2]

counter = 0
for row in range(2):
    for col in range(3):
        axes[row,col].set_xlim(value_range)
        axes[row,col].set_ylim(value_range)
        if (row==1):
            axes[row,col].set_xlabel("x / Bohrs")
        if (col==0):
            axes[row,col].set_ylabel("y / Bohrs")
        axes[row,col].set(adjustable='box', aspect='equal')    # this ensures that the subplots are square
        axes[row,col].set_xticks([-2,0,2])
        for i in range(3):    # plot deuterons
            axes[row,col].scatter(coords[counter][0,i], coords[counter][1,i], s=size_deuteron, c='tab:blue')
        for i in range(3,5):  # plot electrons
            axes[row,col].scatter(coords[counter][0,i], coords[counter][1,i], s=size_electron, c='tab:red')
        axes[row,col].text(0, -1.5, str(nicestring(saved_rho[counter])), horizontalalignment='center', verticalalignment='center')
        counter += 1

plt.savefig(f"{outfolder}/D3plus_randomsamples.png", dpi=300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import numpy as np
import sys
import matplotlib.pyplot as plt
from decimal import Decimal
import matplotlib

startconfig_file = sys.argv[1]
outfile = sys.argv[2]

startconfig = np.loadtxt(startconfig_file)


size_deuteron = 80
size_electron = 20
value_range = [-2, 2]

plt.xlim(value_range)
plt.ylim(value_range)

plt.xlabel("x / Bohrs")
plt.ylabel("z / Bohrs")
plt.gca().set(adjustable='box', aspect='equal')    # this ensures that the subplots are square
for i in range(3):    # plot deuterons
    plt.scatter(startconfig[0,i], startconfig[2,i], s=size_deuteron, c='tab:blue')
for i in range(3,5):  # plot electrons
    plt.scatter(startconfig[0,i], startconfig[2,i], s=size_electron, c='tab:red')

plt.savefig(outfile, dpi=300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
from sklearn_extra.cluster import KMedoids
import numpy as np
import sys

dmfile = sys.argv[1]
outfolder = sys.argv[2]


distance_matrix = np.load(dmfile)

kmedoids1 = KMedoids(n_clusters=1, metric='precomputed').fit(distance_matrix)
np.save(f'{outfolder}/nclusters1_medoid_indices', kmedoids1.medoid_indices_)
np.save(f'{outfolder}/nclusters1_labels', kmedoids1.labels_)
print(kmedoids1.medoid_indices_)
print(kmedoids1.labels_)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import matplotlib.pyplot as plt
import sys
import numpy as np

samplesfile = sys.argv[1]
outfile = sys.argv[2]

xmin = ymin = -1.5
xmax = ymax = 1.5

randangles = [3.230938189452668, 0.6911311486598657, 6.198400566085575, 5.5571267118566965, 1.3296570681242155, 2.013579035210648] # random.random()*2*np.pi


size = 200

# six samples
nselection = 6
Nnuc = 3    # we have three deuterons
colors_samples = ['tab:red', 'tab:green', 'tab:blue', 'tab:olive', 'tab:purple', 'tab:orange']
coords = np.load(samplesfile)
nsteps = coords.shape[0]
for i in range(nselection):
    index = (nsteps//nselection)*(i+1) - 1
    for j in range(Nnuc):    # plot deuterons
        x = np.cos(randangles[i])*coords[index,0,j] + np.sin(randangles[i])*coords[index,1,j]
        y = -np.sin(randangles[i])*coords[index,0,j] + np.cos(randangles[i])*coords[index,1,j]
        plt.scatter(x,y, s=size, c=colors_samples[i])

plt.xlim([xmin,xmax])
plt.ylim([ymin,ymax])
plt.xlabel("x / Bohrs")
plt.ylabel("y / Bohrs")
plt.gca().set(adjustable='box', aspect='equal')

plt.tight_layout()
plt.savefig(outfile, dpi=300)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import sys
import numpy as np

folder = sys.argv[1]

r = np.loadtxt(f"{folder}/startconfig_ppcoord")
r_COM = np.round(np.loadtxt(f"{folder}/startconfig_COM"), decimals = 4)

table_pp = fr"""\begin{{tabular}}{{cccc}}
\hline\hline
 & $x$ & $y$ & $z$ \\ \hline
 $r_{{\text{{D}}_1\text{{D}}_2}}$ & ${r[0]}$ & ${r[1]}$ & ${r[2]}$ \\
 $r_{{\text{{D}}_1\text{{D}}_3}}$ & ${r[3]}$ & ${r[4]}$ & ${r[5]}$ \\
 $r_{{\text{{D}}_1\text{{e}}_1}}$ & ${r[6]}$ & ${r[7]}$ & ${r[8]}$ \\
 $r_{{\text{{D}}_1\text{{e}}_2}}$ & ${r[9]}$ & ${r[10]}$ & ${r[11]}$ \\ \hline\hline
\end{{tabular}}"""  # literal curly braces in the string must be repeated!

table_COM = fr"""\begin{{tabular}}{{cccc}}
\hline\hline
 & $x$ & $y$ & $z$ \\ \hline
 $R^\text{{COM}}_{{\text{{D}}_1}}$ & ${r_COM[0,0]}$ & ${r_COM[1,0]}$ & ${r_COM[2,0]}$ \\
 $R^\text{{COM}}_{{\text{{D}}_2}}$ & ${r_COM[0,1]}$ & ${r_COM[1,1]}$ & ${r_COM[2,1]}$ \\
 $R^\text{{COM}}_{{\text{{D}}_3}}$ & ${r_COM[0,2]}$ & ${r_COM[1,2]}$ & ${r_COM[2,2]}$ \\
 $R^\text{{COM}}_{{\text{{e}}_1}}$ & ${r_COM[0,3]}$ & ${r_COM[1,3]}$ & ${r_COM[2,3]}$ \\
 $R^\text{{COM}}_{{\text{{e}}_2}}$ & ${r_COM[0,4]}$ & ${r_COM[1,4]}$ & ${r_COM[2,4]}$ \\ \hline\hline
\end{{tabular}}"""  # literal curly braces in the string must be repeated!

with open(f"{folder}/table_pp.tex", "w") as outfile:
    print(table_pp, file=outfile)

with open(f"{folder}/table_COM.tex", "w") as outfile:
    print(table_COM, file=outfile)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import sys
import numpy as np

folder = sys.argv[1]

Etot_sampling = round(np.loadtxt(f"{folder}/Etot_sampling").item(), 3)
Etot_analytical = round(np.loadtxt(f"{folder}/Etot_analytical").item(), 3)
rDD_sampling = round(np.loadtxt(f"{folder}/rDD_sampling").item(), 3)
rDD_analytical = round(np.loadtxt(f"{folder}/rDD_analytical").item(), 3)
rDe_sampling = round(np.loadtxt(f"{folder}/rDe_sampling").item(), 3)
rDe_analytical = round(np.loadtxt(f"{folder}/rDe_analytical").item(), 3)
ree_sampling = round(np.loadtxt(f"{folder}/ree_sampling").item(), 3)
ree_analytical = round(np.loadtxt(f"{folder}/ree_analytical").item(), 3)


table = fr"""\begin{{tabular}}{{lcccc}}
\hline \hline
& $\langle E_\text{{tot}} \rangle $ & $\langle r_\text{{DD}} \rangle $ & $\langle r_\text{{De}} \rangle$ & $\langle r_\text{{ee}} \rangle$ \\ \hline
Analytical: & ${Etot_analytical}$ & ${rDD_analytical}$ & ${rDe_analytical}$ & ${ree_analytical}$ \\
Sample: & ${Etot_sampling}$ & ${rDD_sampling}$ & ${rDe_sampling}$ & ${ree_sampling}$ \\ \hline\hline
\end{{tabular}}"""

with open(f"{folder}/table.tex", "w") as outfile:
    print(table, file=outfile)
32
33
34
35
36
shell:
    """
    mkdir -p {params.samplpath}/sampling_asymmetricstart
    julia scripts/sample.jl ../resources/parameters_D3+ {params.nsampling} {params.samplpath}/sampling_asymmetricstart
    """
47
48
shell:
    "julia scripts/write_distancematrix.jl ../resources/parameters_D3+ {params.samplpath}/sampling_asymmetricstart/sample {params.nclustering} {output}"
59
60
shell:
    "python scripts/run_kmedoids.py {input} {params.outfolder}"
72
73
74
75
76
shell:
    """
    mkdir -p {params.clustpath}/visualization
    julia scripts/align_samples_D3plus.jl {input} {params.nsampling} {params.nclustering} {output}
    """
88
89
shell:
    "python scripts/kde_estimate_nuclei_heatmap.py {input} {params.nclustering} 6 {output}"
102
103
104
105
106
shell:
    """
    mkdir -p {params.outfolder}
    julia scripts/write_samples_planebasis.jl ../resources/parameters_D3+ {params.samplpath}/sampling_asymmetricstart 6 {params.outfolder}
    """
118
119
shell:
    "python scripts/plot_samples.py {params.outfolder}"
131
132
shell:
    "python scripts/kde_estimate_electrons.py {input} {params.clusterinterval} {output}"
142
143
144
145
146
shell:
    """
    mkdir -p {params.outfolder}
    julia scripts/estimate_Etot.jl {input.sample} ../resources/parameters_D3+ {output}
    """
155
156
157
158
159
shell:
    """
    mkdir -p {params.outfolder}
    julia scripts/Etot_analytical.jl {input} {output}
    """
170
171
172
173
174
shell:
    """
    mkdir -p {params.outfolder}
    julia scripts/estimate_distances_D3plus.jl {input} {params.outfolder}
    """
185
186
187
188
189
shell:
    """
    mkdir -p {params.outfolder}
    julia scripts/distances_analytical.jl {input} {params.outfolder}
    """
205
206
207
208
shell:
    """
    python scripts/write_table.py {params.outfolder}
    """
219
220
shell:
    "python scripts/plot_equilibration.py {input} {params.nsteps} {output}"
227
228
shell:
    "julia scripts/optimize_jumpinglengths.jl ../resources/parameters_D3+ 50000 {output}"
235
236
237
238
239
240
shell:
    """
    mkdir -p ../results/plot_startconfig_asymmetric
    julia scripts/startconfig_planebasis.jl ../resources/parameters_D3+ ../results/plot_startconfig_asymmetric
    python scripts/plot_startconfig.py ../results/plot_startconfig_asymmetric/startconfig_COM ../results/plot_startconfig_asymmetric/startconfig.png
    """
249
250
shell:
    "python scripts/samples_randomorientation.py {input} {output}"
256
257
258
259
shell:
    """
    python scripts/write_startconfig_tables.py ../results/plot_startconfig_asymmetric/
    """
ShowHide 26 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/LucasLang/molecular_structure_analysis
Name: molecular_structure_analysis
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...