From 5865434d8ba5c0efcb13c13e0afbb938b9804aec Mon Sep 17 00:00:00 2001
From: "Eric T. Johnson" <eric.t.johnson@stonybrook.edu>
Date: Fri, 1 Dec 2023 17:35:15 -0500
Subject: [PATCH] Add python code to read diagnostic files and deduplicate them

---
 Util/scripts/diag_parser.py | 99 +++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 Util/scripts/diag_parser.py
diff --git a/Util/scripts/diag_parser.py b/Util/scripts/diag_parser.py
new file mode 100644
index 0000000000..aef233cc44
--- /dev/null
+++ b/Util/scripts/diag_parser.py
@@ -0,0 +1,99 @@
+"""Helper functions for working with Castro diagnostic files (*_diag.out)
+
+To use these in a standalone script, you can do one of the following:
+
+* append $CASTRO_HOME/Util/scripts to sys.path at the top of your script:
+    sys.path.append("<path to Castro>/Util/scripts")
+
+* add a symlink to this file in the same directory as your script:
+    $ ln -s "$CASTRO_HOME/Util/scripts/diag_parser.py" .
+
+* copy this file into the same directory as your script
+
+Then you can do `from diag_parser import deduplicate, read_diag_file`.
+"""
+
+from pathlib import Path
+
+import numpy as np
+
+""" Format notes
+files are opened in Castro.cpp, data is written in sum_integrated_quantities.cpp
+
+data_logs[0]: grid_diag.out
+intwidth, fixwidth, datwidth*
+
+data_logs[1]: gravity_diag.out
+- this was previously missing the last column number (8), which we handle for
+  backwards compatibility
+intwidth, fixwidth, datwidth, datwidth, datwidth, datwidth, datwidth, datwidth
+
+data_logs[2]: species_diag.out
+intwidth, fixwidth, datwidth*
+
+data_logs[3]: amr_diag.out
+- if compiled with GPU support, this will have two additional integer fields at
+  the end with size `datwidth` for the GPU memory usage
+- column 5 (max number of subcycles) is an integer
+intwidth, fixwidth, fixwidth, intwidth, fixwidth, datwidth
+"""
+
+datwidth = 25  # Floating point data in scientific notation
+fixwidth = 25  # Floating point data not in scientific notation
+intwidth = 12  # Integer data
+
+# Any additional columns after these are assumed to be floating point values in
+# scientific notation (amr_diag.out gets special handling)
+FIELD_WIDTHS = {
+    "grid_diag.out": [intwidth, fixwidth],
+    "gravity_diag.out": [intwidth, fixwidth] + [datwidth] * 6,
+    "species_diag.out": [intwidth, fixwidth],
+    "amr_diag.out": [intwidth, fixwidth, fixwidth, intwidth, fixwidth, datwidth],
+}
+
+
+def read_diag_file(file_path):
+    """Reads a Castro diagnostic file into a numpy structured array.
+
+    Currently only supports the default files that Castro generates.
+    """
+    if not isinstance(file_path, Path):
+        file_path = Path(file_path)
+    filetype = file_path.name
+    if filetype not in FIELD_WIDTHS:
+        raise ValueError("Unsupported file name")
+    widths = FIELD_WIDTHS[filetype]
+    with open(file_path, "r") as f:
+        # try getting the number of columns from the first line
+        first_line = f.readline().rstrip("\n")
+        if filetype == "gravity_diag.out":
+            # gravity_diag.out is missing the last column number, but it
+            # fortunately has a fixed number of columns
+            num_columns = 8
+        else:
+            num_columns = int(first_line.split()[-1])
+        # pad out the widths list on the right if necessary
+        widths.extend([datwidth] * (num_columns - len(widths)))
+        # infer datatypes from the widths
+        dtypes = [int if w == intwidth else float for w in widths]
+        # amr_diag.out has several integer columns with long names
+        if filetype == "amr_diag.out":
+            dtypes[4] = int  # max number of subcycles
+            if num_columns >= 8:
+                dtypes[6] = int  # maximum gpu memory used
+                dtypes[7] = int  # minimum gpu memory free
+        # already read the first header line, so we don't need to skip any rows
+        data = np.genfromtxt(
+            f, delimiter=widths, comments="#", dtype=dtypes, names=True
+        )
+    return data
+
+
+def deduplicate(data):
+    """Deduplicate based on the timestep, keeping the only last occurrence."""
+    # get the unique indices into the reversed timestep array, so we find the
+    # final occurrence of each timestep
+    _, rev_indices = np.unique(data["TIMESTEP"][::-1], return_index=True)
+    # np.unique() sorts by value, so we don't need to un-reverse rev_indices
+    unique_indices = data.shape[0] - rev_indices - 1
+    return data[unique_indices]