Merge pull request #12 from rafaelbicudo/packaging

Packaging
hmcezar · Dec 19, 2024 · a568a59 · a568a59
2 parents e740cd3 + f468630
commit a568a59
Show file tree

Hide file tree

Showing 18 changed files with 437 additions and 158 deletions.
diff --git a/README.md b/README.md
@@ -107,45 +107,45 @@ The number of clusters that were found, as well as the number of members for eac
 Below there is an example of how this information is printed:
 ```
 $ clusttraj trajectory.xyz -rmsd 3.2 -np 4 -p -n -cc xyz
+2024-12-12 17:48:19,268 INFO     [distmat.py:34] <get_distmat> Calculating distance matrix using 4 threads
 
-2023-08-25 11:37:35,201 INFO     [distmat.py:22] <get_distmat> Calculating distance matrix using 4 threads
+2024-12-12 17:48:23,800 INFO     [distmat.py:38] <get_distmat> Saving condensed distance matrix to distmat.npy
 
-2023-08-25 11:37:35,770 INFO     [distmat.py:26] <get_distmat> Saving condensed distance matrix to distmat.npy
+2024-12-12 17:48:23,801 INFO     [classify.py:97] <classify_structures> Clustering using 'average' method to join the clusters
 
-2023-08-25 11:37:35,771 INFO     [classify.py:8] <classify_structures> Starting clustering using 'average' method to join the clusters
+2024-12-12 17:48:23,803 INFO     [classify.py:105] <classify_structures> Saving clustering classification to clusters.dat
 
-2023-08-25 11:37:35,772 INFO     [classify.py:15] <classify_structures> Saving clustering classification to clusters.dat
+2024-12-12 17:48:23,804 INFO     [main.py:59] <main> Writing superposed configurations per cluster to files clusters_confs_*.xyz
 
-2023-08-25 11:37:35,772 INFO     [main.py:41] <main> Writing superposed configurations per cluster to files clusters_confs_*.xyz
-
-2023-08-25 11:37:37,797 INFO     [main.py:71] <main> A total 100 snapshots were read and 16 cluster(s) was(were) found.
+2024-12-12 17:48:26,729 INFO     [main.py:102] <main> A total 100 snapshots were read and 7 cluster(s) was(were) found.
 The cluster sizes are:
-Cluster Size
-1       3
-2       6
-3       6
-4       8
-5       7
-6       6
-7       11
-8       2
-9       6
-10      11
-11      6
-12      5
-13      4
-14      7
-15      4
-16      8
+Cluster	Size
+1	3
+2	3
+3	31
+4	30
+5	18
+6	3
+7	12
+
+2024-12-12 17:48:26,729 INFO     [main.py:126] <main> Total wall time: 7.462641 s
+
 ```
 
 In the cluster output file (`-oc` option, default filename `clusters.dat`) the classification for each structure in the trajectory is printed.
 For example, if the first structure of the trajectory belongs to the cluster number *2*, the second structure belongs to cluster *1*, the third to cluster *2* and so on, the file `clusters.dat` will start with
 ```
-2
-1
-2
-...
+$ head clusters.dat
+7
+4
+5
+3
+4
+7
+6
+7
+4
+3
 ```
 
 The plot of the multidimensional representation (when the `-p` option is used) have each cluster colored in one color as the following picture:

diff --git a/clusttraj/distmat.py b/clusttraj/distmat.py
@@ -192,9 +192,10 @@ def compute_distmat_line(
                 Paview = Paview[prr]
 
                 # build the total structure reordering just these atoms
-                whereins = np.where(
-                    np.isin(np.arange(natoms), reorderexcl[soluexcl]) is True
-                )
+                # whereins = np.where(
+                #     np.isin(np.arange(natoms), reorderexcl[soluexcl]) is True
+                # )
+                whereins = np.where(np.atleast_1d(np.isin(np.arange(natoms), reorderexcl[soluexcl])))
                 Psolu = np.insert(
                     Pview,
                     [x - whereins[0].tolist().index(x) for x in whereins[0]],
@@ -263,7 +264,8 @@ def compute_distmat_line(
             Pview = Pview[prr]
 
             # build the total molecule with the reordered atoms
-            whereins = np.where(np.isin(np.arange(len(P)), reorderexcl) is True)
+            # whereins = np.where(np.isin(np.arange(len(P)), reorderexcl) is True)
+            whereins = np.where(np.atleast_1d(np.isin(np.arange(len(P)), reorderexcl)))
             Pr = np.insert(
                 Pview,
                 [x - whereins[0].tolist().index(x) for x in whereins[0]],

diff --git a/clusttraj/io.py b/clusttraj/io.py
@@ -774,9 +774,10 @@ def save_clusters_config(
                     Paview = Paview[prr]
 
                     # build the total molecule reordering just these atoms
-                    whereins = np.where(
-                        np.isin(np.arange(natoms), reorderexcl[soluexcl]) is True
-                    )
+                    # whereins = np.where(
+                    #     np.isin(np.arange(natoms), reorderexcl[soluexcl]) is True
+                    # )
+                    whereins = np.where(np.atleast_1d(np.isin(np.arange(natoms), reorderexcl)))
                     Psolu = np.insert(
                         Pview,
                         [x - whereins[0].tolist().index(x) for x in whereins[0]],
@@ -812,7 +813,6 @@ def save_clusters_config(
                     # prr = reorder(Qa[solvview], Paview, Q[solvview], Pview)
                     # Pview = Pview[prr]
                     # Paview = Paview[prr]
-
                     # # build the total molecule with the reordered atoms
                     # whereins = np.where(
                     #     np.isin(np.arange(natoms, len(P)), reorderexcl[solvexcl]) == True
@@ -849,7 +849,8 @@ def save_clusters_config(
                 Pview = Pview[prr]
 
                 # build the total molecule with the reordered atoms
-                whereins = np.where(np.isin(np.arange(len(P)), reorderexcl) is True)
+                # whereins = np.where(np.isin(np.arange(len(P)), reorderexcl) is True)
+                whereins = np.where(np.atleast_1d(np.isin(np.arange(len(P)), reorderexcl)))
                 Pr = np.insert(
                     Pview,
                     [x - whereins[0].tolist().index(x) for x in whereins[0]],
@@ -882,3 +883,4 @@ def save_clusters_config(
 
         # closes the file for the cnum cluster
         outfile.close()
+ # type: ignore
diff --git a/clusttraj/main.py b/clusttraj/main.py
@@ -83,7 +83,7 @@ def main(args: List[str] = None) -> None:
         start_time = time.monotonic()
         plot_clust_evo(clust_opt, clusters)
 
-        plot_dendrogram(clust_opt, Z)
+        plot_dendrogram(clust_opt, clusters, Z)
 
         plot_mds(clust_opt, clusters, distmat)
 
@@ -104,7 +104,7 @@ def main(args: List[str] = None) -> None:
     # Compute the evaluation metrics
     if clust_opt.metrics:
         start_time = time.monotonic()
-        ss, ch, db, cpcc = compute_metrics(clust_opt, distmat, Z, clusters)
+        ss, ch, db, cpcc = compute_metrics(distmat, Z, clusters)
         end_time = time.monotonic()
 
         outclust_str += f"\nSilhouette score: {ss:.3f}\n"

diff --git a/clusttraj/metrics.py b/clusttraj/metrics.py
@@ -9,20 +9,22 @@
 from scipy.cluster.hierarchy import cophenet
 from typing import Tuple
 import numpy as np
-from .io import ClustOptions
+# from .io import ClustOptions
 
 
 def compute_metrics(
-    clust_opt: ClustOptions,
+    # clust_opt: ClustOptions,
     distmat: np.ndarray,
     z_matrix: np.ndarray,
     clusters: np.ndarray,
 ) -> Tuple[np.float64, np.float64, np.float64, np.float64]:
     """Compute metrics to assess the performance of the clustering procedure.
 
         Args:
-        clust_opt (ClustOptions): The clustering options.
+        # clust_opt (ClustOptions): The clustering options.
+        distmat: The distance matrix.
         z_matrix (np.ndarray): The Z-matrix from hierarchical clustering procedure.
+        clusters (np.ndarray): The cluster classifications for each sample.
 
     Returns:
         ss (np.float64): The silhouette score.
@@ -41,6 +43,6 @@ def compute_metrics(
     db = davies_bouldin_score(squareform(distmat), clusters)
 
     # Compute the cophenetic correlation coefficient
-    cpcc = cophenet(z_matrix)[0]
+    cpcc, _ = cophenet(z_matrix, distmat)
 
     return ss, ch, db, cpcc
diff --git a/clusttraj/plot.py b/clusttraj/plot.py
@@ -3,12 +3,19 @@
 from sklearn import manifold
 from scipy.spatial.distance import squareform
 import scipy.cluster.hierarchy as hcl
+from scipy.cluster import hierarchy
 import matplotlib.pyplot as plt
+import matplotlib.cm as cm
+from matplotlib.colors import to_hex
+from matplotlib.ticker import MaxNLocator
 import numpy as np
 from .io import ClustOptions
 
 
-def plot_clust_evo(clust_opt: ClustOptions, clusters: np.ndarray) -> None:
+def plot_clust_evo(
+        clust_opt: ClustOptions, 
+        clusters: np.ndarray
+) -> None:
     """Plot the evolution of cluster classification over the given samples.
 
     Args:
@@ -18,52 +25,91 @@ def plot_clust_evo(clust_opt: ClustOptions, clusters: np.ndarray) -> None:
     Returns:
         None
     """
+
+    # Define a color for the lines
+    line_color = (0, 0, 0, 0.5)
+
     # plot evolution with o cluster in trajectory
-    plt.figure(figsize=(25, 10))
-    plt.plot(range(1, len(clusters) + 1), clusters, "o-", markersize=4)
-    plt.xlabel("Sample Index")
-    plt.ylabel("Cluster classification")
+    plt.figure(figsize=(10, 6))
+
+    # Set the y-axis to only show integers
+    plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))
+
+    # Increase tick size and font size
+    plt.tick_params(axis='both', which='major', direction='in', labelsize=12)
+
+    plt.plot(range(1, len(clusters) + 1), clusters, markersize=4, color=line_color)
+    plt.scatter(range(1, len(clusters) + 1), clusters, marker="o", c=clusters, cmap=plt.cm.nipy_spectral)
+    plt.xlabel("Sample Index", fontsize=14)
+    plt.ylabel("Cluster classification", fontsize=14)
     plt.savefig(clust_opt.evo_name, bbox_inches="tight")
 
 
-def plot_dendrogram(clust_opt: ClustOptions, Z: np.ndarray) -> None:
+def plot_dendrogram(
+        clust_opt: ClustOptions,
+        clusters: np.ndarray,
+        Z: np.ndarray
+) -> None:
     """Plot a dendrogram based on hierarchical clustering.
 
     Parameters:
         clust_opt (ClustOptions): The options for clustering.
+        clusters (np.ndarray): The cluster labels.
         Z (np.ndarray): The linkage matrix.
 
     Returns:
         None
     """
     # Plot the dendrogram
-    plt.figure(figsize=(25, 10))
-    plt.title("Hierarchical Clustering Dendrogram")
-    plt.xlabel("Sample Index")
-    plt.ylabel(r"RMSD ($\AA$)")
+    plt.figure(figsize=(18, 6))
+    plt.title("Hierarchical Clustering Dendrogram", fontsize=20)
+    # plt.xlabel("Sample Index", fontsize=14)
+    plt.ylabel(r"RMSD ($\AA$)", fontsize=18)
+    plt.tick_params(axis='y', labelsize=18)
 
-    hcl.dendrogram(
-        Z,
-        leaf_rotation=90.0,  # Rotates the x axis labels
-        leaf_font_size=8.0,  # Font size for the x axis labels
-    )
+    # Define a color for the dashed and non-cluster lines
+    line_color = (0, 0, 0, 0.5)
 
-    # Add a horizontal line at the minimum RMSD value
+    # Add a horizontal line at the minimum RMSD value and set the threshold
     if clust_opt.silhouette_score:
         if isinstance(clust_opt.optimal_cut, (np.ndarray, list)):
-            plt.axhline(clust_opt.optimal_cut[0], linestyle="--")
+            plt.axhline(clust_opt.optimal_cut[0], linestyle="--", linewidth=2, color=line_color)
+            threshold = clust_opt.optimal_cut[0]
         elif isinstance(clust_opt.optimal_cut, (float, np.float32, np.float64)):
-            plt.axhline(clust_opt.optimal_cut, linestyle="--")
+            plt.axhline(clust_opt.optimal_cut, linestyle="--", linewidth=2, color=line_color)
+            threshold = clust_opt.optimal_cut
         else:
             raise ValueError("optimal_cut must be a float or np.ndarray")
     else:
-        plt.axhline(clust_opt.min_rmsd, linestyle="--")
+        plt.axhline(clust_opt.min_rmsd, linestyle="--", linewidth=2, color=line_color)
+        threshold = clust_opt.min_rmsd
+
+    # Use the 'nipy_spectral' cmap to color the dendrogram
+    unique_clusters = np.unique(clusters)
+    cmap = cm.get_cmap('nipy_spectral', len(unique_clusters))
+    colors = [to_hex(cmap(i)) for i in range(cmap.N)]
+
+    hierarchy.set_link_color_palette(colors)
+
+    # Plot the dendrogram
+    hcl.dendrogram(
+        Z,
+        # leaf_rotation=90.0,  # Rotates the x axis labels
+        # leaf_font_size=8.0,  # Font size for the x axis labels
+        no_labels=True,
+        color_threshold=threshold,
+        above_threshold_color=line_color
+    )
 
     # Save the dendrogram to a file
     plt.savefig(clust_opt.dendrogram_name, bbox_inches="tight")
 
 
-def plot_mds(clust_opt: ClustOptions, clusters: np.ndarray, distmat: np.ndarray) -> None:
+def plot_mds(
+        clust_opt: ClustOptions, 
+        clusters: np.ndarray, 
+        distmat: np.ndarray
+) -> None:
     """Plot the multidimensional scaling (MDS) of the distance matrix.
 
     Args:
@@ -92,6 +138,9 @@ def plot_mds(clust_opt: ClustOptions, clusters: np.ndarray, distmat: np.ndarray)
     # Perform MDS and get the 2D representation
     coords = mds.fit_transform(squareform(distmat))
 
+    # Set the figure size
+    plt.figure(figsize=(6,6))
+
     # Configure tick parameters
     plt.tick_params(
         axis="both",
@@ -109,14 +158,16 @@ def plot_mds(clust_opt: ClustOptions, clusters: np.ndarray, distmat: np.ndarray)
         coords[:, 0], coords[:, 1], marker="o", c=clusters, cmap=plt.cm.nipy_spectral
     )
 
-    plt.title("MDS Visualization")
+    plt.title("MDS Visualization", fontsize=14)
 
     # Save the plot
     plt.savefig(clust_opt.mds_name, bbox_inches="tight")
 
 
 def plot_tsne(
-    clust_opt: ClustOptions, clusters: np.ndarray, distmat: np.ndarray
+    clust_opt: ClustOptions, 
+    clusters: np.ndarray, 
+    distmat: np.ndarray
 ) -> None:
     """Plot the t-distributed Stochastic Neighbor Embedding 2D plot of the clustering.
 
@@ -143,10 +194,11 @@ def plot_tsne(
 
     # Define a list of unique colors for each cluster
     unique_clusters = np.unique(clusters)
-    colors = plt.cm.tab20(np.linspace(0, 1, len(unique_clusters)))
+    cmap = cm.get_cmap('nipy_spectral', len(unique_clusters))
+    colors = [cmap(i) for i in range(len(unique_clusters))]
 
-    # Create a new figure
-    plt.figure()
+    # Set the figure size
+    plt.figure(figsize=(6,6))
 
     # Configure tick parameters
     plt.tick_params(