Comparing changes

base repository: HEP-PBSP/SIMUnet
base: 09759a211269f4300b83b4c97a9214d34226f910
head repository: HEP-PBSP/SIMUnet
compare: 957a9923b73f01a76c79c1126d9024ba805bf536
2 changes: 2 additions & 0 deletions validphys2/src/validphys/commondataparser.py
@@ -13,6 +13,8 @@
 from validphys.core import peek_commondata_metadata
 from validphys.coredata import CommonData
 
+EXT = "pineappl.lz4"
+
 def load_commondata(spec):
     """
     Load the data corresponding to a CommonDataSpec object.
8 changes: 6 additions & 2 deletions validphys2/src/validphys/config.py
@@ -520,7 +520,7 @@ def produce_simu_parameters_linear_combinations(self, simu_parameters=None):
     def parse_dataset_input(self, dataset: Mapping, simu_parameters_names, simu_parameters_scales, n_simu_parameters, simu_parameters_linear_combinations, simu_parameters=None):
         """The mapping that corresponds to the dataset specifications in the
         fit files"""
-        known_keys = {"dataset", "sys", "cfac", "frac", "weight", "custom_group", "simu_fac", "use_fixed_predictions", "contamination"}
+        known_keys = {"dataset", "sys", "cfac", "frac", "weight", "custom_group", "simu_fac", "use_fixed_predictions", "contamination", "new_commondata"}
         try:
             name = dataset["dataset"]
             if not isinstance(name, str):
@@ -530,6 +530,7 @@ def parse_dataset_input(self, dataset: Mapping, simu_parameters_names, simu_para
                 "'dataset' must be a mapping with " "'dataset' and 'sysnum'"
             )
 
+        new_commondata = dataset.get("new_commondata", False)
         sysnum = dataset.get("sys")
         cfac = dataset.get("cfac", tuple())
         frac = dataset.get("frac", 1)
@@ -572,7 +573,8 @@ def parse_dataset_input(self, dataset: Mapping, simu_parameters_names, simu_para
             custom_group=custom_group,
             use_fixed_predictions=use_fixed_predictions,
             contamination=contamination,
-            **bsm_data
+            **bsm_data,
+            new_commondata=new_commondata,
         )
 
     def parse_use_fitcommondata(self, do_use: bool):
@@ -759,6 +761,7 @@ def produce_dataset(
         use_fixed_predictions = dataset_input.use_fixed_predictions
         contamination = dataset_input.contamination
         contamination_data = contamination_data
+        new_commondata = dataset_input.new_commondata
 
         try:
             ds = self.loader.check_dataset(
@@ -776,6 +779,7 @@ def produce_dataset(
                 use_fixed_predictions=use_fixed_predictions,
                 contamination=contamination,
                 contamination_data=contamination_data,
+                new_commondata=new_commondata,
             )
         except DataNotFoundError as e:
             raise ConfigError(str(e), name, self.loader.available_datasets)
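
For reference, a minimal sketch of the dataset mapping that parse_dataset_input now accepts, written as the Python mapping the parser receives from the runcard (the set name and values are hypothetical):

dataset = {
    "dataset": "EXAMPLE_SET",   # placeholder name
    "frac": 0.75,
    "new_commondata": True,     # opt in to the new commondata/pineappl path
}
new_commondata = dataset.get("new_commondata", False)  # True here; False when the key is omitted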
35 changes: 32 additions & 3 deletions validphys2/src/validphys/core.py
@@ -336,7 +336,7 @@ def plot_kinlabels(self):
 class DataSetInput(TupleComp):
     """Represents whatever the user enters in the YAML to specify a
     dataset."""
-    def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_parameters_names, simu_parameters_linear_combinations, use_fixed_predictions, contamination):
+    def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_parameters_names, simu_parameters_linear_combinations, use_fixed_predictions, contamination, new_commondata):
         self.name=name
         self.sys=sys
         self.cfac = cfac
@@ -347,6 +347,7 @@ def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_paramete
         self.simu_parameters_linear_combinations = simu_parameters_linear_combinations
         self.use_fixed_predictions = use_fixed_predictions
         self.contamination = contamination
+        self.new_commondata = new_commondata
         super().__init__(name, sys, cfac, frac, weight, custom_group)
 
     def __str__(self):
@@ -584,19 +585,47 @@ def __str__(self):
         return self.name
 
 class FKTableSpec(TupleComp):
-    def __init__(self, fkpath, cfactors, use_fixed_predictions=False, fixed_predictions_path=None):
+    def __init__(self, fkpath, cfactors, use_fixed_predictions=False, fixed_predictions_path=None, theory_meta=None, legacy=True):
         self.fkpath = fkpath
-        self.cfactors = cfactors
+        self.cfactors = cfactors if cfactors is not None else []
+        self.legacy = legacy
         self.use_fixed_predictions = use_fixed_predictions
         self.fixed_predictions_path = fixed_predictions_path
 
+        # if not isinstance(fkpath, (tuple, list)):
+        #     self.legacy = True
+        # else:
+        #     fkpath = tuple(fkpath)
+
+        if not self.legacy:
+            fkpath = tuple([fkpath])
+        self.theory_meta = theory_meta
+
+        # For non-legacy theory, add the metadata since it defines how the theory is to be loaded
+        # and thus, it should also define the hash of the class
+        # if not self.legacy:
+        #     super().__init__(fkpath, cfactors, self.metadata)
+        # else:
         super().__init__(fkpath, cfactors)
 
     #NOTE: We cannot do this because Fkset owns the fktable, and trying
     #to reuse the loaded one fails after it gets deleted.
     #@functools.lru_cache()
     def load(self):
         return FKTable(str(self.fkpath), [str(factor) for factor in self.cfactors])
 
+    def load_cfactors(self):
+        """Each of the sub-fktables that form the complete FKTable can have several cfactors
+        applied to it. This function uses ``parse_cfactor`` to make them into CFactorData
+        """
+        from validphys.fkparser import parse_cfactor
+        if self.legacy:
+            raise NotImplementedError("cfactor loading from spec not implemented for old theories")
+
+        return [[parse_cfactor(c.open("rb")) for c in cfacs] for cfacs in self.cfactors]
+
 class PositivitySetSpec(DataSetSpec):
     """Extends DataSetSpec to work around the particularities of the positivity datasets"""
 
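A hedged sketch of how the revised FKTableSpec behaves (paths are placeholders, not real grids): cfactors=None is normalised to an empty list, a non-legacy fkpath is wrapped in a tuple before hashing, and load_cfactors refuses to run on legacy specs.

from validphys.core import FKTableSpec

new_spec = FKTableSpec("EXAMPLE.pineappl.lz4", cfactors=None, legacy=False)
assert new_spec.cfactors == []       # None normalised to an empty list
old_spec = FKTableSpec("FK_EXAMPLE.dat", cfactors=[])  # legacy=True by default
# old_spec.load_cfactors()           # would raise NotImplementedError for legacy specs
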
66 changes: 64 additions & 2 deletions validphys2/src/validphys/coredata.py
@@ -5,13 +5,13 @@
 """
 import dataclasses
 from typing import Dict
+import yaml
 
+from validphys.commondatawriter import write_commondata_to_file, write_systype_to_file
 
 import numpy as np
 import pandas as pd
 
-from validphys.commondatawriter import write_commondata_to_file, write_systype_to_file
-
 @dataclasses.dataclass(eq=False)
 class FKTableData:
@@ -99,6 +99,68 @@ def with_cuts(self, cuts):
         newsigma = self.sigma.loc[cuts]
         return dataclasses.replace(self, ndata=newndata, sigma=newsigma)
 
+    def get_np_fktable(self):
+        """Returns the fktable as a dense numpy array that can be directly
+        manipulated with numpy
+
+        The return shape is:
+            (ndata, nbasis, nx) for DIS
+            (ndata, nbasis, nx, nx) for hadronic
+        where nx is the length of the xgrid
+        and nbasis the number of flavour contributions that contribute
+        """
+        # Read up the shape of the output table
+        ndata = self.ndata
+        nx = len(self.xgrid)
+        nbasis = self.sigma.shape[1]
+
+        if ndata == 0:
+            if self.hadronic:
+                return np.zeros((ndata, nbasis, nx, nx))
+            return np.zeros((ndata, nbasis, nx))
+
+        # Make the dataframe into a dense numpy array
+
+        # First get the data index out of the way
+        # this is necessary because of cuts/shifts and for performance reasons
+        # otherwise we will be putting things in a numpy array in very awkward orders
+        ns = self.sigma.unstack(level=("data",), fill_value=0)
+        x1 = ns.index.get_level_values(0)
+
+        if self.hadronic:
+            x2 = ns.index.get_level_values(1)
+            fk_raw = np.zeros((nx, nx, ns.shape[1]))
+            fk_raw[x2, x1, :] = ns.values
+
+            # The output is (ndata, basis, x1, x2)
+            fktable = fk_raw.reshape((nx, nx, nbasis, ndata)).T
+        else:
+            fk_raw = np.zeros((nx, ns.shape[1]))
+            fk_raw[x1, :] = ns.values
+
+            # The output is (ndata, basis, x1)
+            fktable = fk_raw.reshape((nx, nbasis, ndata)).T
+
+        return fktable
+
+    @property
+    def luminosity_mapping(self):
+        """Return the flavour combinations that contribute to the fktable
+        in the form of a single array
+
+        The return shape is:
+            (nbasis,) for DIS
+            (nbasis*2,) for hadronic
+        """
+        basis = self.sigma.columns.to_numpy()
+        if self.hadronic:
+            ret = np.zeros(14 * 14, dtype=bool)
+            ret[basis] = True
+            basis = np.array(np.where(ret.reshape(14, 14))).T.reshape(-1)
+        return basis
 
 
 @dataclasses.dataclass(eq=False)
 class CFactorData:
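
As a sanity check on the hadronic branch of luminosity_mapping above, a small standalone sketch (the flat indices are made up): column indices into the flattened 14x14 flavour grid are expanded into interleaved (i, j) pairs.

import numpy as np

basis = np.array([15, 30])          # hypothetical flat indices: 15 -> (1, 1), 30 -> (2, 2)
ret = np.zeros(14 * 14, dtype=bool)
ret[basis] = True
pairs = np.array(np.where(ret.reshape(14, 14))).T.reshape(-1)
print(pairs)                        # [1 1 2 2], i.e. the pairs (1, 1) and (2, 2) flattened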
10 changes: 8 additions & 2 deletions validphys2/src/validphys/fkparser.py
@@ -29,6 +29,7 @@
 import pandas as pd
 
 from validphys.coredata import FKTableData, CFactorData
+from validphys.pineparser import pineappl_reader
 
 
@@ -53,8 +54,13 @@ class GridInfo:
 def load_fktable(spec):
     """Load the data corresponding to a FKSpec object. The cfactors
     will be applied to the grid."""
-    with open_fkpath(spec.fkpath) as handle:
-        tabledata = parse_fktable(handle)
+    if spec.legacy:
+        with open_fkpath(spec.fkpath) as handle:
+            tabledata = parse_fktable(handle)
+
+    else:
+        tabledata = pineappl_reader(spec)
 
     if not spec.cfactors:
         return tabledata
 
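A short usage sketch of the dispatch above (paths are placeholders; with real files, load_fktable routes on spec.legacy — the specs below are only constructed, not loaded):

from validphys.core import FKTableSpec
from validphys.fkparser import load_fktable

legacy_spec = FKTableSpec("FK_EXAMPLE.dat", cfactors=[])                    # legacy=True -> parse_fktable
pine_spec = FKTableSpec("EXAMPLE.pineappl.lz4", cfactors=[], legacy=False)  # -> pineappl_reader
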
47 changes: 40 additions & 7 deletions validphys2/src/validphys/loader.py
@@ -32,6 +32,7 @@
                              InternalCutsWrapper, HyperscanSpec)
 from validphys.utils import tempfile_cleaner
 from validphys import lhaindex
+from validphys.pineparser import parse_theory_meta
 
 DEFAULT_NNPDF_PROFILE_PATH = f"{sys.prefix}/share/NNPDF/nnprofile.yaml"
 
@@ -351,7 +352,7 @@ def get_commondata(self, setname, sysnum):
         return cd.load()
 
     # @functools.lru_cache()
-    def check_fktable(self, theoryID, setname, cfac, use_fixed_predictions=False):
+    def check_fktable(self, theoryID, setname, cfac, use_fixed_predictions=False, new_commondata=False):
         _, theopath = self.check_theoryID(theoryID)
 
         if use_fixed_predictions:
@@ -362,14 +363,45 @@ def check_fktable(self, theoryID, setname, cfac, use_fixed_predictions=False):
             fixed_predictions_path = theopath / 'simu_factors' / ('SIMU_%s.yaml' % setname)
             cfactors = self.check_cfactor(theoryID, setname, cfac)
             return FKTableSpec(fkpath, cfactors, use_fixed_predictions=True, fixed_predictions_path=fixed_predictions_path)
 
-        fkpath = theopath / 'fastkernel' / ('FK_%s.dat' % setname)
-        if not fkpath.exists():
-            raise FKTableNotFound(("Could not find FKTable for set '%s'. "
-                                   "File '%s' not found") % (setname, fkpath))
+        # use a different file name for the FK table if the commondata is new
+        if new_commondata:
+            # Need to pass a TheoryMeta object to FKTableSpec
+            path_metadata = theopath / 'fastkernel' / f'{setname}_metadata.yaml'
+            if not path_metadata.exists():
+                raise InconsistentMetaDataError(f"Could not find '_metadata.yaml' file for set {setname}. "
+                                                f"File '{path_metadata}' not found.")
+            # get the observable name from the setname
+            with open(path_metadata, 'r') as f:
+                metadata = yaml.safe_load(f)
+            # NOTE: write a "_metadata.yaml" file for each observable (then `metadata["implemented_observables"][0]` makes sense)
+            fktables = metadata["implemented_observables"][0]["theory"]["FK_tables"][0]
+            fkpath = tuple([theopath / 'fastkernel' / f'{fktable}.pineappl.lz4' for fktable in fktables])
+            for path in fkpath:
+                if not path.exists():
+                    raise FKTableNotFound(("Could not find FKTable for set '%s'. "
+                                           "File '%s' not found") % (setname, path))
+        else:
+            fkpath = theopath / 'fastkernel' / ('FK_%s.dat' % setname)
+            if not fkpath.exists():
+                raise FKTableNotFound(("Could not find FKTable for set '%s'. "
+                                       "File '%s' not found") % (setname, fkpath))
 
         cfactors = self.check_cfactor(theoryID, setname, cfac)
-        return FKTableSpec(fkpath, cfactors)
+        if new_commondata:
+            common_prefix = os.path.commonprefix([metadata['setname'], setname])
+
+            observable_name = setname[len(common_prefix):]
+            if observable_name.startswith('_'):
+                observable_name = observable_name[1:]
+
+            theory_meta = parse_theory_meta(path_metadata, observable_name=observable_name)
+
+            return FKTableSpec(fkpath, cfactors, theory_meta=theory_meta, legacy=False)
+        else:
+            return FKTableSpec(fkpath, cfactors)
 
     def check_compound(self, theoryID, setname, cfac):
         thid, theopath = self.check_theoryID(theoryID)
@@ -549,6 +581,7 @@ def check_dataset(
         use_fixed_predictions=False,
         contamination=None,
         contamination_data=None,
+        new_commondata=False,
     ):
 
         if not isinstance(theoryid, TheoryIDSpec):
@@ -561,7 +594,7 @@ def check_dataset(
         try:
             fkspec, op = self.check_compound(theoryno, name, cfac)
         except CompoundNotFound:
-            fkspec = self.check_fktable(theoryno, name, cfac, use_fixed_predictions=use_fixed_predictions)
+            fkspec = self.check_fktable(theoryno, name, cfac, use_fixed_predictions=use_fixed_predictions, new_commondata=new_commondata)
             op = None
 
         #Note this is simply for convenience when scripting. The config will
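
A hedged sketch of the minimal `_metadata.yaml` structure that check_fktable relies on, together with the observable-name extraction (all names are hypothetical; real files carry more fields):

import os

metadata = {
    "setname": "EXAMPLE_SET",                               # hypothetical
    "implemented_observables": [
        {"theory": {"FK_tables": [["GRID_A", "GRID_B"]]}},  # hypothetical grid names
    ],
}
fktables = metadata["implemented_observables"][0]["theory"]["FK_tables"][0]
# -> ["GRID_A", "GRID_B"]; each becomes <theopath>/fastkernel/<name>.pineappl.lz4

setname = "EXAMPLE_SET_OBS"
common_prefix = os.path.commonprefix([metadata["setname"], setname])
observable_name = setname[len(common_prefix):]
if observable_name.startswith("_"):
    observable_name = observable_name[1:]
# observable_name == "OBS"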
31 changes: 20 additions & 11 deletions validphys2/src/validphys/n3fit_data.py
@@ -180,11 +180,20 @@ def _mask_fk_tables(dataset_dicts, tr_masks):
         vl_fks = []
         ex_fks = []
         vl_mask = ~tr_mask
 
         for fktable_dict in dataset_dict["fktables"]:
-            tr_fks.append(fktable_dict["fktable"][tr_mask])
-            vl_fks.append(fktable_dict["fktable"][vl_mask])
-            ex_fks.append(fktable_dict.get("fktable"))
-        dataset_dict['ds_tr_mask'] = tr_mask
+            if not dataset_dict["use_fixed_predictions"]:
+                tr_fks.append(fktable_dict["fktable"][tr_mask])
+                vl_fks.append(fktable_dict["fktable"][vl_mask])
+                ex_fks.append(fktable_dict.get("fktable"))
+                dataset_dict['ds_tr_mask'] = tr_mask
+            # note: fixed observables have a fake fktable
+            else:
+                tr_fks.append(fktable_dict["fktable"])
+                vl_fks.append([])
+                ex_fks.append(fktable_dict.get("fktable"))
+                dataset_dict['ds_tr_mask'] = tr_mask
 
         dataset_dict["tr_fktables"] = tr_fks
         dataset_dict["vl_fktables"] = vl_fks
         dataset_dict["ex_fktables"] = ex_fks
@@ -243,13 +252,13 @@ def fitting_data_dict(
     # TODO: Plug in the python data loading when available. Including but not
     # limited to: central values, ndata, replica generation, covmat construction
     if data.datasets:
-        try:
-            spec_c = data.load()
-        except:
-            breakpoint()
-        ndata = spec_c.GetNData()
-        expdata_true = spec_c.get_cv().reshape(1, ndata)
-        datasets = common_data_reader_experiment(spec_c, data)
+        ndata = sum([ds.commondata.load_commondata(cuts=ds.cuts).ndata for ds in data.datasets])
+        expdata_true = np.array([])
+        for ds in data.datasets:
+            expdata_true = np.append(expdata_true, ds.commondata.load_commondata(cuts=ds.cuts).central_values)
+        expdata_true = expdata_true.reshape(1, ndata)
+        # expdata_true = np.array([ds.commondata.load_commondata(cuts=ds.cuts).central_values for ds in data.datasets]).reshape(1,ndata)
+        datasets = common_data_reader_experiment(data)
         for i in range(len(data.datasets)):
             if data.datasets[i].use_fixed_predictions:
                 datasets[i]['use_fixed_predictions'] = True
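
A toy illustration (shapes are hypothetical) of the training/validation split performed in _mask_fk_tables above: a boolean mask selects the training rows of each fktable, and its complement selects the validation rows.

import numpy as np

fktable = np.arange(12).reshape(4, 3)            # 4 data points, 3 columns
tr_mask = np.array([True, False, True, False])   # hypothetical training mask
vl_mask = ~tr_mask
tr_fk, vl_fk = fktable[tr_mask], fktable[vl_mask]
print(tr_fk.shape, vl_fk.shape)                  # (2, 3) (2, 3)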