Pineparser for compatibility with new theories #63

Open · wants to merge 28 commits into main from new_pineparser

Commits (28)
992abff
added new pineparser from nnpdf and legacy option to load_fktable fun…
comane Apr 14, 2024
c33b941
added get_np_fktable method to FKTableData
comane Apr 14, 2024
ccd7f14
added legacy and metadata args to FKTableSpec
comane Apr 14, 2024
2e641b0
first version of new fktable parser
comane Apr 14, 2024
e27f7ee
added luminosity mapping method to FKTableData
comane Apr 14, 2024
145d528
xgrid reshape
comane Apr 14, 2024
89fd055
include cuts when loading fk table, seems to work for DIS
comane Apr 14, 2024
ff86914
works with theory 270
comane Apr 14, 2024
8552a1e
tmp modifications
comane Apr 27, 2024
32a2891
added load_commondata method to CommonDataSpec
comane Apr 28, 2024
dfa6617
added commondatawriter module
comane Apr 28, 2024
9701a59
added export method to CommonData -> allows to use _filter_real_data …
comane Apr 28, 2024
e69ac93
common_data_reader_dataset and experiment in n3fit_data_utils now onl…
comane Apr 28, 2024
8fe5bea
when new_commondata: True -> legacy: False -> pass a TheoryMeta to FK…
comane Apr 28, 2024
a5878fa
added test for metadata existence, array append, compatibility with 2…
FrancescoMerlotti Jul 10, 2024
003f065
fix typo
FrancescoMerlotti May 8, 2024
a57a3b5
add .pdf PBSP logos
FrancescoMerlotti May 13, 2024
af2dc71
changed cuts to commondata_table_indices
FrancescoMerlotti May 30, 2024
848e9d1
added xq2 map for hadronic MQQ processes ref. [2303.06159]
FrancescoMerlotti Jun 12, 2024
ff92599
Revert "added xq2 map for hadronic MQQ processes ref. [2303.06159]"
FrancescoMerlotti Jun 18, 2024
5848365
Merge branch 'main' into new_pineparser
FrancescoMerlotti Jul 24, 2024
433cfd5
updated to support new theory format
FrancescoMerlotti Jul 26, 2024
1ad0780
added operation in new format framework
FrancescoMerlotti Nov 23, 2024
1ecbc0b
added new_commondata to check_compound in loader
FrancescoMerlotti Nov 24, 2024
2331eed
right formatting filtered artificial data
FrancescoMerlotti Nov 24, 2024
188a57d
loading cfac for new theory fktables
FrancescoMerlotti Nov 26, 2024
18665c1
fixed cfac array for new format
Nov 26, 2024
9eb043b
contamination working in python
FrancescoMerlotti Nov 28, 2024
4 changes: 3 additions & 1 deletion validphys2/src/validphys/commondataparser.py
@@ -13,6 +13,8 @@
from validphys.core import peek_commondata_metadata
from validphys.coredata import CommonData

EXT = "pineappl.lz4"

def load_commondata(spec):
"""
Load the data corresponding to a CommonDataSpec object.
@@ -53,7 +55,7 @@ def parse_commondata(commondatafile, systypefile, setname):
commondatatable.columns = commondataheader
commondatatable.set_index("entry", inplace=True)
ndata = len(commondatatable)
commondataproc = commondatatable["process"][1]
commondataproc = commondatatable["process"].iloc[0]
# Check for consistency with commondata metadata
cdmetadata = peek_commondata_metadata(commondatafile)
if (setname, nsys, ndata) != attrgetter('name', 'nsys', 'ndata')(cdmetadata):
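The one functional change in this file swaps label-based indexing for positional indexing. A minimal pandas sketch (hypothetical values) of why that matters: the commondata table carries a 1-based "entry" index, so table["process"][1] only works while the label 1 happens to exist, whereas .iloc[0] always returns the first remaining row.

import pandas as pd

# Toy two-point table mimicking the commondata layout (hypothetical values)
table = pd.DataFrame({"process": ["DIS_NCE", "DIS_NCE"]},
                     index=pd.Index([1, 2], name="entry"))
cut = table.drop(1)               # e.g. entry 1 removed upstream
print(cut["process"].iloc[0])     # "DIS_NCE": first remaining row, always valid
# cut["process"][1] would raise KeyError here: label 1 no longer exists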
86 changes: 86 additions & 0 deletions validphys2/src/validphys/commondatawriter.py
@@ -0,0 +1,86 @@
"""
This module contains functions to write commondata and systypes
tables to files.
"""
import numpy as np


def write_commondata_data(commondata, buffer):
"""
write the commondata table to a buffer, which can be a memory map,
a compressed archive or a string buffer (for instance a StringIO object)


Parameters
----------

commondata : validphys.coredata.CommonData

buffer : memory map, compressed archive or strings
example: StringIO object


Example
-------
>>> from validphys.loader import Loader
>>> from io import StringIO

>>> l = Loader()
>>> cd = l.check_commondata("NMC").load_commondata_instance()
>>> sio = StringIO()
>>> write_commondata_data(cd,sio)
>>> print(sio.getvalue())

"""
header = f"{commondata.setname} {commondata.nsys} {commondata.ndata}\n"
buffer.write(header)
commondata.commondata_table.index = np.arange(1, len(commondata.commondata_table)+1)
commondata.commondata_table.to_csv(buffer, float_format="%20.12e", sep="\t", header=None)


def write_commondata_to_file(commondata, path):
"""
write commondata table to file
"""
with open(path, "w") as file:
write_commondata_data(commondata, file)


def write_systype_data(commondata, buffer):
"""
write the systype table to a buffer, which can be a memory map,
a compressed archive or a string buffer (for instance a StringIO object)


Parameters
----------

commondata : validphys.coredata.CommonData

buffer : memory map, compressed archive or strings
example: StringIO object


Example
-------
>>> from validphys.loader import Loader
>>> from io import StringIO

>>> l = Loader()
>>> cd = l.check_commondata("NMC").load_commondata_instance()
>>> sio = StringIO()
>>> write_systype_data(cd,sio)
>>> print(sio.getvalue())

"""
header = f"{commondata.nsys}\n"
buffer.write(header)
commondata.systype_table.to_csv(buffer, sep="\t", header=None)


def write_systype_to_file(commondata, path):
"""
write systype table to file
"""
with open(path, "w") as file:
write_systype_data(commondata, file)
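For reference, a self-contained sketch (toy values, no validphys imports) of the layout write_commondata_data emits: a "<setname> <nsys> <ndata>" header line, followed by the table as tab-separated values with a 1-based row index, as libNNPDF expects.

from io import StringIO
import numpy as np
import pandas as pd

# Toy stand-in for CommonData.commondata_table (hypothetical numbers)
table = pd.DataFrame({"process": ["DIS", "DIS"], "data": [1.0, 1.1]})
buf = StringIO()
buf.write("TOYSET 0 2\n")                   # "<setname> <nsys> <ndata>" header
table.index = np.arange(1, len(table) + 1)  # rows are numbered from 1
table.to_csv(buf, float_format="%20.12e", sep="\t", header=None)
print(buf.getvalue())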
8 changes: 6 additions & 2 deletions validphys2/src/validphys/config.py
@@ -512,7 +512,7 @@ def produce_simu_parameters_linear_combinations(self, simu_parameters=None):
def parse_dataset_input(self, dataset: Mapping, simu_parameters_names, simu_parameters_scales, n_simu_parameters, simu_parameters_linear_combinations, simu_parameters=None):
"""The mapping that corresponds to the dataset specifications in the
fit files"""
known_keys = {"dataset", "sys", "cfac", "frac", "weight", "custom_group", "simu_fac", "use_fixed_predictions", "contamination"}
known_keys = {"dataset", "sys", "cfac", "frac", "weight", "custom_group", "simu_fac", "use_fixed_predictions", "contamination", "new_commondata"}
try:
name = dataset["dataset"]
if not isinstance(name, str):
@@ -522,6 +522,7 @@ def parse_dataset_input(self, dataset: Mapping, simu_para
"'dataset' must be a mapping with " "'dataset' and 'sysnum'"
)

new_commondata = dataset.get("new_commondata", False)
sysnum = dataset.get("sys")
cfac = dataset.get("cfac", tuple())
frac = dataset.get("frac", 1)
@@ -564,7 +565,8 @@ def parse_dataset_input(self, dataset: Mapping, simu_para
custom_group=custom_group,
use_fixed_predictions=use_fixed_predictions,
contamination=contamination,
**bsm_data
**bsm_data,
new_commondata=new_commondata,
)

def parse_use_fitcommondata(self, do_use: bool):
@@ -751,6 +753,7 @@ def produce_dataset(
use_fixed_predictions = dataset_input.use_fixed_predictions
contamination = dataset_input.contamination
contamination_data = contamination_data
new_commondata = dataset_input.new_commondata

try:
ds = self.loader.check_dataset(
@@ -768,6 +771,7 @@ def produce_dataset(
use_fixed_predictions=use_fixed_predictions,
contamination=contamination,
contamination_data=contamination_data,
new_commondata=new_commondata,
)
except DataNotFoundError as e:
raise ConfigError(str(e), name, self.loader.available_datasets)
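A sketch of the mapping parse_dataset_input now accepts (dataset name hypothetical); new_commondata defaults to False, so existing runcards keep working unchanged:

# Hypothetical runcard entry as received by parse_dataset_input;
# only "dataset" is required, every other key is optional
dataset = {"dataset": "SOME_NEW_DATASET", "frac": 0.75, "new_commondata": True}

# Mirrors the parsing above: the flag is read with a False default
new_commondata = dataset.get("new_commondata", False)
assert new_commondata is True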
24 changes: 12 additions & 12 deletions validphys2/src/validphys/convolution.py
@@ -105,7 +105,7 @@ def _predictions(dataset, pdf, fkfunc):
all replicas, central, etc) according to the provided ``fkfunc``, which
should have the same interface as e.g. ``fk_predictions``.
"""
opfunc = OP[dataset.op]
opfunc = OP[dataset.op.upper()]
if dataset.cuts is None:
raise PredictionsRequireCutsError(
"FKTables do not always generate predictions for some datapoints "
@@ -119,17 +119,17 @@
# predictions instead.
all_predictions = []
for fk in dataset.fkspecs:
if not fk.use_fixed_predictions:
all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf))
else:
with open(fk.fixed_predictions_path, 'rb') as f:
fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed'])
# Now need to reshape it according to the expected number of predictions
if fkfunc == central_fk_predictions:
all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data']))
elif fkfunc == fk_predictions:
fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1))
all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())]))
if not fk.use_fixed_predictions:
all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf))
else:
with open(fk.fixed_predictions_path, 'rb') as f:
fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed'])
# Now need to reshape it according to the expected number of predictions
if fkfunc == central_fk_predictions:
all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data']))
elif fkfunc == fk_predictions:
fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1))
all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())]))

return opfunc(*all_predictions)

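The .upper() guard matters because the new-format metadata can record the operation name in lowercase while the OP table is keyed in uppercase; an abridged sketch (toy OP table, not the full validphys mapping):

import operator

# Abridged stand-in for validphys.convolution.OP (the real table has more entries)
OP = {"NULL": lambda x: x, "RATIO": operator.truediv}

op_name = "ratio"             # hypothetical value read from new-format metadata
opfunc = OP[op_name.upper()]  # without .upper() this lookup would raise KeyError
assert opfunc(3.0, 2.0) == 1.5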
41 changes: 37 additions & 4 deletions validphys2/src/validphys/core.py
@@ -327,7 +327,7 @@ def load_commondata(self, cuts=None):
if cuts is not None:
cd = cd.with_cuts(cuts)
return cd

@property
def plot_kinlabels(self):
return get_plot_kinlabels(self)
@@ -336,7 +336,7 @@ def plot_kinlabels(self):
class DataSetInput(TupleComp):
"""Represents whatever the user enters in the YAML to specify a
dataset."""
def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_parameters_names, simu_parameters_linear_combinations, use_fixed_predictions, contamination):
def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_parameters_names, simu_parameters_linear_combinations, use_fixed_predictions, contamination, new_commondata):
self.name=name
self.sys=sys
self.cfac = cfac
@@ -347,6 +347,7 @@ def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_paramete
self.simu_parameters_linear_combinations = simu_parameters_linear_combinations
self.use_fixed_predictions = use_fixed_predictions
self.contamination = contamination
self.new_commondata = new_commondata
super().__init__(name, sys, cfac, frac, weight, custom_group)

def __str__(self):
@@ -584,19 +585,51 @@ def __str__(self):
return self.name

class FKTableSpec(TupleComp):
def __init__(self, fkpath, cfactors, use_fixed_predictions=False, fixed_predictions_path=None):
def __init__(self, fkpath, cfactors, use_fixed_predictions=False, fixed_predictions_path=None, theory_meta=None, legacy=True):
self.fkpath = fkpath
self.cfactors = cfactors
self.cfactors = cfactors if cfactors is not None else []
self.legacy = legacy
self.use_fixed_predictions = use_fixed_predictions
self.fixed_predictions_path = fixed_predictions_path

# if not isinstance(fkpath, (tuple, list)):
# self.legacy = True
# else:
# fkpath = tuple(fkpath)

if not self.legacy:
fkpath = tuple([fkpath])
self.theory_meta = theory_meta

# For non-legacy theory, add the metadata since it defines how the theory is to be loaded
# and thus, it should also define the hash of the class
# if not self.legacy:
# super().__init__(fkpath, cfactors, self.metadata)
# else:
super().__init__(fkpath, cfactors)


#NOTE: We cannot do this because Fkset owns the fktable, and trying
#to reuse the loaded one fails after it gets deleted.
#@functools.lru_cache()
def load(self):
return FKTable(str(self.fkpath), [str(factor) for factor in self.cfactors])


def load_cfactors(self):
"""Each of the sub-fktables that form the complete FKTable can have several cfactors
applied to it. This function uses ``parse_cfactor`` to make them into CFactorData
"""
from validphys.fkparser import parse_cfactor
if self.legacy:
raise NotImplementedError("cfactor loading from spec not implemented for old theories")
cfacs = []
for c in self.cfactors:
with open(c, "rb") as f:
cfacs.append(parse_cfactor(f))
return [cfacs]

class PositivitySetSpec(DataSetSpec):
"""Extends DataSetSpec to work around the particularities of the positivity datasets"""

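A minimal construction sketch under the new flag (paths hypothetical, assuming validphys is importable): legacy specs keep a single path, while non-legacy specs carry the theory metadata and wrap the path into a 1-tuple inside __init__:

from validphys.core import FKTableSpec

# Legacy table: a single .dat path, legacy defaults to True
legacy_spec = FKTableSpec("theories/FK_NMC.dat", cfactors=None)

# New-format table: a pineappl grid plus its TheoryMeta (None here for brevity)
new_spec = FKTableSpec("theories/NMC.pineappl.lz4", cfactors=[],
                       theory_meta=None, legacy=False)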
81 changes: 79 additions & 2 deletions validphys2/src/validphys/coredata.py
@@ -5,11 +5,11 @@

"""
import dataclasses
from typing import Dict
import yaml

import numpy as np
import pandas as pd

from validphys.commondatawriter import write_commondata_to_file, write_systype_to_file

@dataclasses.dataclass(eq=False)
class FKTableData:
@@ -97,6 +97,68 @@ def with_cuts(self, cuts):
newsigma = self.sigma.loc[cuts]
return dataclasses.replace(self, ndata=newndata, sigma=newsigma)

def get_np_fktable(self):
"""Returns the fktable as a dense numpy array that can be directly
manipulated with numpy

The return shape is:
(ndata, nbasis, nx) for DIS
(ndata, nbasis, nx, nx) for hadronic
where nx is the length of the xgrid
and nbasis the number of flavour combinations that contribute
"""
# Read up the shape of the output table
ndata = self.ndata
nx = len(self.xgrid)
nbasis = self.sigma.shape[1]

if ndata == 0:
if self.hadronic:
return np.zeros((ndata, nbasis, nx, nx))
return np.zeros((ndata, nbasis, nx))

# Make the dataframe into a dense numpy array

# First get the data index out of the way
# this is necessary because of cuts/shifts and for performance reasons
# otherwise we will be putting things in a numpy array in very awkward orders
ns = self.sigma.unstack(level=("data",), fill_value=0)
x1 = ns.index.get_level_values(0)

if self.hadronic:
x2 = ns.index.get_level_values(1)
fk_raw = np.zeros((nx, nx, ns.shape[1]))
fk_raw[x2, x1, :] = ns.values

# The output is (ndata, basis, x1, x2)
fktable = fk_raw.reshape((nx, nx, nbasis, ndata)).T
else:
fk_raw = np.zeros((nx, ns.shape[1]))
fk_raw[x1, :] = ns.values

# The output is (ndata, basis, x1)
fktable = fk_raw.reshape((nx, nbasis, ndata)).T

return fktable


@property
def luminosity_mapping(self):
"""Return the flavour combinations that contribute to the fktable
in the form of a single array

The return shape is:
(nbasis,) for DIS
(nbasis*2,) for hadronic
"""
basis = self.sigma.columns.to_numpy()
if self.hadronic:
ret = np.zeros(14 * 14, dtype=bool)
ret[basis] = True
basis = np.array(np.where(ret.reshape(14, 14))).T.reshape(-1)
return basis



@dataclasses.dataclass(eq=False)
class CFactorData:
@@ -302,3 +364,18 @@ def with_central_value(self, cv):
tb = self.commondata_table.copy()
tb["data"] = cv
return dataclasses.replace(self, commondata_table=tb)

def export(self, path):
"""Export the data, and error types
Use the same format as libNNPDF:

- A DATA_<dataset>.dat file with the dataframe of accepted points
- A systypes/STYPES_<dataset>.dat file with the error types
"""

dat_path = path / f"DATA_{self.setname}.dat"
sys_path = path / "systypes" / f"SYSTYPE_{self.setname}_DEFAULT.dat"
sys_path.parent.mkdir(exist_ok=True)

write_systype_to_file(self, sys_path)
write_commondata_to_file(self, dat_path)
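To make the dense conversion in get_np_fktable concrete, a toy DIS-like example (hypothetical numbers) reproducing its core steps:

import numpy as np
import pandas as pd

# sigma mirrors FKTableData.sigma for DIS: MultiIndex (data, x) rows, one
# column per flavour-basis entry; absent points are implicit zeros
idx = pd.MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1)], names=["data", "x"])
sigma = pd.DataFrame([[1.0], [2.0], [3.0]], index=idx, columns=[5])
ndata, nx, nbasis = 2, 3, 1

ns = sigma.unstack(level=("data",), fill_value=0)  # columns become (basis, data)
x1 = ns.index.get_level_values(0)
fk_raw = np.zeros((nx, ns.shape[1]))
fk_raw[x1, :] = ns.values
fktable = fk_raw.reshape((nx, nbasis, ndata)).T    # -> (ndata, nbasis, nx)
assert fktable.shape == (2, 1, 3)
assert fktable[0, 0, 0] == 1.0 and fktable[0, 0, 2] == 2.0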
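And the hadronic branch of luminosity_mapping decomposes each flattened 14x14 luminosity index k into the flavour pair (k // 14, k % 14), interleaving the pairs into a single array; a one-channel sketch (hypothetical channel index):

import numpy as np

basis = np.array([5 * 14 + 3])  # hypothetical single channel: flavour pair (5, 3)
ret = np.zeros(14 * 14, dtype=bool)
ret[basis] = True
pairs = np.array(np.where(ret.reshape(14, 14))).T.reshape(-1)
assert (pairs == [5, 3]).all()  # (first flavour, second flavour) interleaved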