Skip to content

Commit

Permalink
Merge pull request #12 from rafaelbicudo/packaging
Browse files Browse the repository at this point in the history
Packaging
  • Loading branch information
rafaelbicudo authored Dec 19, 2024
2 parents e740cd3 + f468630 commit a568a59
Show file tree
Hide file tree
Showing 18 changed files with 437 additions and 158 deletions.
56 changes: 28 additions & 28 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,45 +107,45 @@ The number of clusters that were found, as well as the number of members for eac
Below there is an example of how this information is printed:
```
$ clusttraj trajectory.xyz -rmsd 3.2 -np 4 -p -n -cc xyz
2024-12-12 17:48:19,268 INFO [distmat.py:34] <get_distmat> Calculating distance matrix using 4 threads
2023-08-25 11:37:35,201 INFO [distmat.py:22] <get_distmat> Calculating distance matrix using 4 threads
2024-12-12 17:48:23,800 INFO [distmat.py:38] <get_distmat> Saving condensed distance matrix to distmat.npy
2023-08-25 11:37:35,770 INFO [distmat.py:26] <get_distmat> Saving condensed distance matrix to distmat.npy
2024-12-12 17:48:23,801 INFO [classify.py:97] <classify_structures> Clustering using 'average' method to join the clusters
2023-08-25 11:37:35,771 INFO [classify.py:8] <classify_structures> Starting clustering using 'average' method to join the clusters
2024-12-12 17:48:23,803 INFO [classify.py:105] <classify_structures> Saving clustering classification to clusters.dat
2023-08-25 11:37:35,772 INFO [classify.py:15] <classify_structures> Saving clustering classification to clusters.dat
2024-12-12 17:48:23,804 INFO [main.py:59] <main> Writing superposed configurations per cluster to files clusters_confs_*.xyz
2023-08-25 11:37:35,772 INFO [main.py:41] <main> Writing superposed configurations per cluster to files clusters_confs_*.xyz
2023-08-25 11:37:37,797 INFO [main.py:71] <main> A total 100 snapshots were read and 16 cluster(s) was(were) found.
2024-12-12 17:48:26,729 INFO [main.py:102] <main> A total 100 snapshots were read and 7 cluster(s) was(were) found.
The cluster sizes are:
Cluster Size
1 3
2 6
3 6
4 8
5 7
6 6
7 11
8 2
9 6
10 11
11 6
12 5
13 4
14 7
15 4
16 8
Cluster Size
1 3
2 3
3 31
4 30
5 18
6 3
7 12
2024-12-12 17:48:26,729 INFO [main.py:126] <main> Total wall time: 7.462641 s
```

In the cluster output file (`-oc` option, default filename `clusters.dat`) the classification for each structure in the trajectory is printed.
For example, if the first structure of the trajectory belongs to the cluster number *2*, the second structure belongs to cluster *1*, the third to cluster *2* and so on, the file `clusters.dat` will start with
```
2
1
2
...
$ head clusters.dat
7
4
5
3
4
7
6
7
4
3
```

The plot of the multidimensional representation (when the `-p` option is used) have each cluster colored in one color as the following picture:
Expand Down
10 changes: 6 additions & 4 deletions clusttraj/distmat.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,9 +192,10 @@ def compute_distmat_line(
Paview = Paview[prr]

# build the total structure reordering just these atoms
whereins = np.where(
np.isin(np.arange(natoms), reorderexcl[soluexcl]) is True
)
# whereins = np.where(
# np.isin(np.arange(natoms), reorderexcl[soluexcl]) is True
# )
whereins = np.where(np.atleast_1d(np.isin(np.arange(natoms), reorderexcl[soluexcl])))
Psolu = np.insert(
Pview,
[x - whereins[0].tolist().index(x) for x in whereins[0]],
Expand Down Expand Up @@ -263,7 +264,8 @@ def compute_distmat_line(
Pview = Pview[prr]

# build the total molecule with the reordered atoms
whereins = np.where(np.isin(np.arange(len(P)), reorderexcl) is True)
# whereins = np.where(np.isin(np.arange(len(P)), reorderexcl) is True)
whereins = np.where(np.atleast_1d(np.isin(np.arange(len(P)), reorderexcl)))
Pr = np.insert(
Pview,
[x - whereins[0].tolist().index(x) for x in whereins[0]],
Expand Down
12 changes: 7 additions & 5 deletions clusttraj/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,9 +774,10 @@ def save_clusters_config(
Paview = Paview[prr]

# build the total molecule reordering just these atoms
whereins = np.where(
np.isin(np.arange(natoms), reorderexcl[soluexcl]) is True
)
# whereins = np.where(
# np.isin(np.arange(natoms), reorderexcl[soluexcl]) is True
# )
whereins = np.where(np.atleast_1d(np.isin(np.arange(natoms), reorderexcl)))
Psolu = np.insert(
Pview,
[x - whereins[0].tolist().index(x) for x in whereins[0]],
Expand Down Expand Up @@ -812,7 +813,6 @@ def save_clusters_config(
# prr = reorder(Qa[solvview], Paview, Q[solvview], Pview)
# Pview = Pview[prr]
# Paview = Paview[prr]

# # build the total molecule with the reordered atoms
# whereins = np.where(
# np.isin(np.arange(natoms, len(P)), reorderexcl[solvexcl]) == True
Expand Down Expand Up @@ -849,7 +849,8 @@ def save_clusters_config(
Pview = Pview[prr]

# build the total molecule with the reordered atoms
whereins = np.where(np.isin(np.arange(len(P)), reorderexcl) is True)
# whereins = np.where(np.isin(np.arange(len(P)), reorderexcl) is True)
whereins = np.where(np.atleast_1d(np.isin(np.arange(len(P)), reorderexcl)))
Pr = np.insert(
Pview,
[x - whereins[0].tolist().index(x) for x in whereins[0]],
Expand Down Expand Up @@ -882,3 +883,4 @@ def save_clusters_config(

# closes the file for the cnum cluster
outfile.close()
# type: ignore
4 changes: 2 additions & 2 deletions clusttraj/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def main(args: List[str] = None) -> None:
start_time = time.monotonic()
plot_clust_evo(clust_opt, clusters)

plot_dendrogram(clust_opt, Z)
plot_dendrogram(clust_opt, clusters, Z)

plot_mds(clust_opt, clusters, distmat)

Expand All @@ -104,7 +104,7 @@ def main(args: List[str] = None) -> None:
# Compute the evaluation metrics
if clust_opt.metrics:
start_time = time.monotonic()
ss, ch, db, cpcc = compute_metrics(clust_opt, distmat, Z, clusters)
ss, ch, db, cpcc = compute_metrics(distmat, Z, clusters)
end_time = time.monotonic()

outclust_str += f"\nSilhouette score: {ss:.3f}\n"
Expand Down
10 changes: 6 additions & 4 deletions clusttraj/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,22 @@
from scipy.cluster.hierarchy import cophenet
from typing import Tuple
import numpy as np
from .io import ClustOptions
# from .io import ClustOptions


def compute_metrics(
clust_opt: ClustOptions,
# clust_opt: ClustOptions,
distmat: np.ndarray,
z_matrix: np.ndarray,
clusters: np.ndarray,
) -> Tuple[np.float64, np.float64, np.float64, np.float64]:
"""Compute metrics to assess the performance of the clustering procedure.
Args:
clust_opt (ClustOptions): The clustering options.
# clust_opt (ClustOptions): The clustering options.
distmat: The distance matrix.
z_matrix (np.ndarray): The Z-matrix from hierarchical clustering procedure.
clusters (np.ndarray): The cluster classifications for each sample.
Returns:
ss (np.float64): The silhouette score.
Expand All @@ -41,6 +43,6 @@ def compute_metrics(
db = davies_bouldin_score(squareform(distmat), clusters)

# Compute the cophenetic correlation coefficient
cpcc = cophenet(z_matrix)[0]
cpcc, _ = cophenet(z_matrix, distmat)

return ss, ch, db, cpcc
102 changes: 77 additions & 25 deletions clusttraj/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,19 @@
from sklearn import manifold
from scipy.spatial.distance import squareform
import scipy.cluster.hierarchy as hcl
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import to_hex
from matplotlib.ticker import MaxNLocator
import numpy as np
from .io import ClustOptions


def plot_clust_evo(clust_opt: ClustOptions, clusters: np.ndarray) -> None:
def plot_clust_evo(
clust_opt: ClustOptions,
clusters: np.ndarray
) -> None:
"""Plot the evolution of cluster classification over the given samples.
Args:
Expand All @@ -18,52 +25,91 @@ def plot_clust_evo(clust_opt: ClustOptions, clusters: np.ndarray) -> None:
Returns:
None
"""

# Define a color for the lines
line_color = (0, 0, 0, 0.5)

# plot evolution with o cluster in trajectory
plt.figure(figsize=(25, 10))
plt.plot(range(1, len(clusters) + 1), clusters, "o-", markersize=4)
plt.xlabel("Sample Index")
plt.ylabel("Cluster classification")
plt.figure(figsize=(10, 6))

# Set the y-axis to only show integers
plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))

# Increase tick size and font size
plt.tick_params(axis='both', which='major', direction='in', labelsize=12)

plt.plot(range(1, len(clusters) + 1), clusters, markersize=4, color=line_color)
plt.scatter(range(1, len(clusters) + 1), clusters, marker="o", c=clusters, cmap=plt.cm.nipy_spectral)
plt.xlabel("Sample Index", fontsize=14)
plt.ylabel("Cluster classification", fontsize=14)
plt.savefig(clust_opt.evo_name, bbox_inches="tight")


def plot_dendrogram(clust_opt: ClustOptions, Z: np.ndarray) -> None:
def plot_dendrogram(
clust_opt: ClustOptions,
clusters: np.ndarray,
Z: np.ndarray
) -> None:
"""Plot a dendrogram based on hierarchical clustering.
Parameters:
clust_opt (ClustOptions): The options for clustering.
clusters (np.ndarray): The cluster labels.
Z (np.ndarray): The linkage matrix.
Returns:
None
"""
# Plot the dendrogram
plt.figure(figsize=(25, 10))
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Sample Index")
plt.ylabel(r"RMSD ($\AA$)")
plt.figure(figsize=(18, 6))
plt.title("Hierarchical Clustering Dendrogram", fontsize=20)
# plt.xlabel("Sample Index", fontsize=14)
plt.ylabel(r"RMSD ($\AA$)", fontsize=18)
plt.tick_params(axis='y', labelsize=18)

hcl.dendrogram(
Z,
leaf_rotation=90.0, # Rotates the x axis labels
leaf_font_size=8.0, # Font size for the x axis labels
)
# Define a color for the dashed and non-cluster lines
line_color = (0, 0, 0, 0.5)

# Add a horizontal line at the minimum RMSD value
# Add a horizontal line at the minimum RMSD value and set the threshold
if clust_opt.silhouette_score:
if isinstance(clust_opt.optimal_cut, (np.ndarray, list)):
plt.axhline(clust_opt.optimal_cut[0], linestyle="--")
plt.axhline(clust_opt.optimal_cut[0], linestyle="--", linewidth=2, color=line_color)
threshold = clust_opt.optimal_cut[0]
elif isinstance(clust_opt.optimal_cut, (float, np.float32, np.float64)):
plt.axhline(clust_opt.optimal_cut, linestyle="--")
plt.axhline(clust_opt.optimal_cut, linestyle="--", linewidth=2, color=line_color)
threshold = clust_opt.optimal_cut
else:
raise ValueError("optimal_cut must be a float or np.ndarray")
else:
plt.axhline(clust_opt.min_rmsd, linestyle="--")
plt.axhline(clust_opt.min_rmsd, linestyle="--", linewidth=2, color=line_color)
threshold = clust_opt.min_rmsd

# Use the 'nipy_spectral' cmap to color the dendrogram
unique_clusters = np.unique(clusters)
cmap = cm.get_cmap('nipy_spectral', len(unique_clusters))
colors = [to_hex(cmap(i)) for i in range(cmap.N)]

hierarchy.set_link_color_palette(colors)

# Plot the dendrogram
hcl.dendrogram(
Z,
# leaf_rotation=90.0, # Rotates the x axis labels
# leaf_font_size=8.0, # Font size for the x axis labels
no_labels=True,
color_threshold=threshold,
above_threshold_color=line_color
)

# Save the dendrogram to a file
plt.savefig(clust_opt.dendrogram_name, bbox_inches="tight")


def plot_mds(clust_opt: ClustOptions, clusters: np.ndarray, distmat: np.ndarray) -> None:
def plot_mds(
clust_opt: ClustOptions,
clusters: np.ndarray,
distmat: np.ndarray
) -> None:
"""Plot the multidimensional scaling (MDS) of the distance matrix.
Args:
Expand Down Expand Up @@ -92,6 +138,9 @@ def plot_mds(clust_opt: ClustOptions, clusters: np.ndarray, distmat: np.ndarray)
# Perform MDS and get the 2D representation
coords = mds.fit_transform(squareform(distmat))

# Set the figure size
plt.figure(figsize=(6,6))

# Configure tick parameters
plt.tick_params(
axis="both",
Expand All @@ -109,14 +158,16 @@ def plot_mds(clust_opt: ClustOptions, clusters: np.ndarray, distmat: np.ndarray)
coords[:, 0], coords[:, 1], marker="o", c=clusters, cmap=plt.cm.nipy_spectral
)

plt.title("MDS Visualization")
plt.title("MDS Visualization", fontsize=14)

# Save the plot
plt.savefig(clust_opt.mds_name, bbox_inches="tight")


def plot_tsne(
clust_opt: ClustOptions, clusters: np.ndarray, distmat: np.ndarray
clust_opt: ClustOptions,
clusters: np.ndarray,
distmat: np.ndarray
) -> None:
"""Plot the t-distributed Stochastic Neighbor Embedding 2D plot of the clustering.
Expand All @@ -143,10 +194,11 @@ def plot_tsne(

# Define a list of unique colors for each cluster
unique_clusters = np.unique(clusters)
colors = plt.cm.tab20(np.linspace(0, 1, len(unique_clusters)))
cmap = cm.get_cmap('nipy_spectral', len(unique_clusters))
colors = [cmap(i) for i in range(len(unique_clusters))]

# Create a new figure
plt.figure()
# Set the figure size
plt.figure(figsize=(6,6))

# Configure tick parameters
plt.tick_params(
Expand Down
Loading

0 comments on commit a568a59

Please sign in to comment.