diff --git a/validphys2/src/validphys/commondataparser.py b/validphys2/src/validphys/commondataparser.py index ab2cbaf8c..9fd198ca5 100644 --- a/validphys2/src/validphys/commondataparser.py +++ b/validphys2/src/validphys/commondataparser.py @@ -13,6 +13,8 @@ from validphys.core import peek_commondata_metadata from validphys.coredata import CommonData +EXT = "pineappl.lz4" + def load_commondata(spec): """ Load the data corresponding to a CommonDataSpec object. @@ -53,7 +55,7 @@ def parse_commondata(commondatafile, systypefile, setname): commondatatable.columns = commondataheader commondatatable.set_index("entry", inplace=True) ndata = len(commondatatable) - commondataproc = commondatatable["process"][1] + commondataproc = commondatatable["process"].iloc[0] # Check for consistency with commondata metadata cdmetadata = peek_commondata_metadata(commondatafile) if (setname, nsys, ndata) != attrgetter('name', 'nsys', 'ndata')(cdmetadata): diff --git a/validphys2/src/validphys/commondatawriter.py b/validphys2/src/validphys/commondatawriter.py new file mode 100644 index 000000000..87578fe3f --- /dev/null +++ b/validphys2/src/validphys/commondatawriter.py @@ -0,0 +1,86 @@ +import numpy as np +""" +This module contains functions to write commondata and systypes +tables to files +""" + + +def write_commondata_data(commondata, buffer): + """ + write commondata table to buffer, this can be a memory map, + compressed archive or strings (using for instance StringIO) + + + Parameters + ---------- + + commondata : validphys.coredata.CommonData + + buffer : memory map, compressed archive or strings + example: StringIO object + + + Example + ------- + >>> from validphys.loader import Loader + >>> from io import StringIO + + >>> l = Loader() + >>> cd = l.check_commondata("NMC").load_commondata_instance() + >>> sio = StringIO() + >>> write_commondata_data(cd,sio) + >>> print(sio.getvalue()) + + """ + header = f"{commondata.setname} {commondata.nsys} {commondata.ndata}\n" + buffer.write(header) + commondata.commondata_table.index = np.arange(1, len(commondata.commondata_table)+1) + commondata.commondata_table.to_csv(buffer, float_format="%20.12e", sep="\t", header=None) + + +def write_commondata_to_file(commondata, path): + """ + write commondata table to file + """ + with open(path, "w") as file: + write_commondata_data(commondata, file) + + +def write_systype_data(commondata, buffer): + """ + write systype table to buffer, this can be a memory map, + compressed archive or strings (using for instance StringIO) + + + Parameters + ---------- + + commondata : validphys.coredata.CommonData + + buffer : memory map, compressed archive or strings + example: StringIO object + + + Example + ------- + >>> from validphys.loader import Loader + >>> from io import StringIO + + >>> l = Loader() + >>> cd = l.check_commondata("NMC").load_commondata_instance() + >>> sio = StringIO() + >>> write_systype_data(cd,sio) + >>> print(sio.getvalue()) + + """ + header = f"{commondata.nsys}\n" + buffer.write(header) + commondata.systype_table.to_csv(buffer, sep="\t", header=None) + + +def write_systype_to_file(commondata, path): + """ + write systype table to file + """ + with open(path, "w") as file: + write_systype_data(commondata, file) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index bad0bd964..8eb6f863e 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -512,7 +512,7 @@ def produce_simu_parameters_linear_combinations(self, simu_parameters=None): def 
parse_dataset_input(self, dataset: Mapping, simu_parameters_names, simu_parameters_scales, n_simu_parameters, simu_parameters_linear_combinations, simu_parameters=None): """The mapping that corresponds to the dataset specifications in the fit files""" - known_keys = {"dataset", "sys", "cfac", "frac", "weight", "custom_group", "simu_fac", "use_fixed_predictions", "contamination"} + known_keys = {"dataset", "sys", "cfac", "frac", "weight", "custom_group", "simu_fac", "use_fixed_predictions", "contamination", "new_commondata"} try: name = dataset["dataset"] if not isinstance(name, str): @@ -522,6 +522,7 @@ def parse_dataset_input(self, dataset: Mapping, simu_parameters_names, simu_para "'dataset' must be a mapping with " "'dataset' and 'sysnum'" ) + new_commondata = dataset.get("new_commondata", False) sysnum = dataset.get("sys") cfac = dataset.get("cfac", tuple()) frac = dataset.get("frac", 1) @@ -564,7 +565,8 @@ def parse_dataset_input(self, dataset: Mapping, simu_parameters_names, simu_para custom_group=custom_group, use_fixed_predictions=use_fixed_predictions, contamination=contamination, - **bsm_data + **bsm_data, + new_commondata=new_commondata, ) def parse_use_fitcommondata(self, do_use: bool): @@ -751,6 +753,7 @@ def produce_dataset( use_fixed_predictions = dataset_input.use_fixed_predictions contamination = dataset_input.contamination contamination_data = contamination_data + new_commondata = dataset_input.new_commondata try: ds = self.loader.check_dataset( @@ -768,6 +771,7 @@ def produce_dataset( use_fixed_predictions=use_fixed_predictions, contamination=contamination, contamination_data=contamination_data, + new_commondata=new_commondata, ) except DataNotFoundError as e: raise ConfigError(str(e), name, self.loader.available_datasets) diff --git a/validphys2/src/validphys/convolution.py b/validphys2/src/validphys/convolution.py index b0dec81fd..f3cda1e9e 100644 --- a/validphys2/src/validphys/convolution.py +++ b/validphys2/src/validphys/convolution.py @@ -105,7 +105,7 @@ def _predictions(dataset, pdf, fkfunc): all replicas, central, etc) according to the provided ``fkfunc``, which should have the same interface as e.g. ``fk_predictions``. """ - opfunc = OP[dataset.op] + opfunc = OP[dataset.op.upper()] if dataset.cuts is None: raise PredictionsRequireCutsError( "FKTables do not always generate predictions for some datapoints " @@ -119,17 +119,17 @@ def _predictions(dataset, pdf, fkfunc): # predictions instead. 
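    # The lookup above uses ``OP[dataset.op.upper()]`` presumably because the
    # operation string coming from new-style metadata can be lower case (e.g.
    # "ratio"), while the OP mapping in this module is keyed by upper-case names.
    # The loop below then mixes two kinds of contributions: ordinary FKTable
    # convolutions, and "fixed" SM predictions read from a SIMU_*.yaml file,
    # reshaped to mimic central or per-replica predictions before the
    # dataset-level operation is applied.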
all_predictions = [] for fk in dataset.fkspecs: - if not fk.use_fixed_predictions: - all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf)) - else: - with open(fk.fixed_predictions_path, 'rb') as f: - fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed']) - # Now need to reshape it according it to the expected number of predictions - if fkfunc == central_fk_predictions: - all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data'])) - elif fkfunc == fk_predictions: - fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1)) - all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())])) + if not fk.use_fixed_predictions: + all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf)) + else: + with open(fk.fixed_predictions_path, 'rb') as f: + fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed']) + # Now need to reshape it according it to the expected number of predictions + if fkfunc == central_fk_predictions: + all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data'])) + elif fkfunc == fk_predictions: + fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1)) + all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())])) return opfunc(*all_predictions) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index be329c5d9..dbed7af43 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -327,7 +327,7 @@ def load_commondata(self, cuts=None): if cuts is not None: cd = cd.with_cuts(cuts) return cd - + @property def plot_kinlabels(self): return get_plot_kinlabels(self) @@ -336,7 +336,7 @@ def plot_kinlabels(self): class DataSetInput(TupleComp): """Represents whatever the user enters in the YAML to specify a dataset.""" - def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_parameters_names, simu_parameters_linear_combinations, use_fixed_predictions, contamination): + def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_parameters_names, simu_parameters_linear_combinations, use_fixed_predictions, contamination, new_commondata): self.name=name self.sys=sys self.cfac = cfac @@ -347,6 +347,7 @@ def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_paramete self.simu_parameters_linear_combinations = simu_parameters_linear_combinations self.use_fixed_predictions = use_fixed_predictions self.contamination = contamination + self.new_commondata = new_commondata super().__init__(name, sys, cfac, frac, weight, custom_group) def __str__(self): @@ -584,12 +585,29 @@ def __str__(self): return self.name class FKTableSpec(TupleComp): - def __init__(self, fkpath, cfactors, use_fixed_predictions=False, fixed_predictions_path=None): + def __init__(self, fkpath, cfactors, use_fixed_predictions=False, fixed_predictions_path=None, theory_meta=None, legacy=True): self.fkpath = fkpath - self.cfactors = cfactors + self.cfactors = cfactors if cfactors is not None else [] + self.legacy = legacy self.use_fixed_predictions = use_fixed_predictions self.fixed_predictions_path = fixed_predictions_path + + # if not isinstance(fkpath, (tuple, list)): + # self.legacy = True + # else: + # fkpath = tuple(fkpath) + + if not self.legacy: + fkpath = tuple([fkpath]) + self.theory_meta = theory_meta + + # For non-legacy theory, add the metadata since it defines how the theory is to be loaded + # and thus, it should also define the hash of the class 
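        # A rough construction sketch (the grid name below is hypothetical): the loader
        # builds legacy specs as ``FKTableSpec(fkpath, cfactors)`` pointing at a single
        # FK_<setname>.dat file, whereas for new-style commondata it passes a tuple of
        # ``*.pineappl.lz4`` paths together with the parsed theory metadata, e.g.
        #   FKTableSpec((theopath / "fastkernel" / "GRID_A.pineappl.lz4",),
        #               cfactors=[], theory_meta=theory_meta, legacy=False)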
+ # if not self.legacy: + # super().__init__(fkpath, cfactors, self.metadata) + # else: super().__init__(fkpath, cfactors) + #NOTE: We cannot do this because Fkset owns the fktable, and trying #to reuse the loaded one fails after it gets deleted. @@ -597,6 +615,21 @@ def __init__(self, fkpath, cfactors, use_fixed_predictions=False, fixed_predicti def load(self): return FKTable(str(self.fkpath), [str(factor) for factor in self.cfactors]) + + def load_cfactors(self): + """Each of the sub-fktables that form the complete FKTable can have several cfactors + applied to it. This function uses ``parse_cfactor`` to make them into CFactorData + """ + from validphys.fkparser import parse_cfactor + if self.legacy: + raise NotImplementedError("cfactor loading from spec not implemented for old theories") + cfacs = [] + for c in self.cfactors: + with open(c, "rb") as f: + cfacs.append(parse_cfactor(f)) + f.close() + return [cfacs] + class PositivitySetSpec(DataSetSpec): """Extends DataSetSpec to work around the particularities of the positivity datasets""" diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index edd6023e8..8486269be 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -5,11 +5,11 @@ """ import dataclasses -from typing import Dict +import yaml import numpy as np import pandas as pd - +from validphys.commondatawriter import write_commondata_to_file, write_systype_to_file @dataclasses.dataclass(eq=False) class FKTableData: @@ -97,6 +97,68 @@ def with_cuts(self, cuts): newsigma = self.sigma.loc[cuts] return dataclasses.replace(self, ndata=newndata, sigma=newsigma) + def get_np_fktable(self): + """Returns the fktable as a dense numpy array that can be directly + manipulated with numpy + + The return shape is: + (ndata, nx, nbasis) for DIS + (ndata, nx, nx, nbasis) for hadronic + where nx is the length of the xgrid + and nbasis the number of flavour contributions that contribute + """ + # Read up the shape of the output table + ndata = self.ndata + nx = len(self.xgrid) + nbasis = self.sigma.shape[1] + + if ndata == 0: + if self.hadronic: + return np.zeros((ndata, nbasis, nx, nx)) + return np.zeros((ndata, nbasis, nx)) + + # Make the dataframe into a dense numpy array + + # First get the data index out of the way + # this is necessary because cuts/shifts and for performance reasons + # otherwise we will be putting things in a numpy array in very awkward orders + ns = self.sigma.unstack(level=("data",), fill_value=0) + x1 = ns.index.get_level_values(0) + + if self.hadronic: + x2 = ns.index.get_level_values(1) + fk_raw = np.zeros((nx, nx, ns.shape[1])) + fk_raw[x2, x1, :] = ns.values + + # The output is (ndata, basis, x1, x2) + fktable = fk_raw.reshape((nx, nx, nbasis, ndata)).T + else: + fk_raw = np.zeros((nx, ns.shape[1])) + fk_raw[x1, :] = ns.values + + # The output is (ndata, basis, x1) + fktable = fk_raw.reshape((nx, nbasis, ndata)).T + + return fktable + + + @property + def luminosity_mapping(self): + """Return the flavour combinations that contribute to the fktable + in the form of a single array + + The return shape is: + (nbasis,) for DIS + (nbasis*2,) for hadronic + """ + basis = self.sigma.columns.to_numpy() + if self.hadronic: + ret = np.zeros(14 * 14, dtype=bool) + ret[basis] = True + basis = np.array(np.where(ret.reshape(14, 14))).T.reshape(-1) + return basis + + @dataclasses.dataclass(eq=False) class CFactorData: @@ -302,3 +364,18 @@ def with_central_value(self, cv): tb = self.commondata_table.copy() 
tb["data"] = cv return dataclasses.replace(self, commondata_table=tb) + + def export(self, path): + """Export the data, and error types + Use the same format as libNNPDF: + + - A DATA_.dat file with the dataframe of accepted points + - A systypes/STYPES_.dat file with the error types + """ + + dat_path = path / f"DATA_{self.setname}.dat" + sys_path = path / "systypes" / f"SYSTYPE_{self.setname}_DEFAULT.dat" + sys_path.parent.mkdir(exist_ok=True) + + write_systype_to_file(self, sys_path) + write_commondata_to_file(self, dat_path) diff --git a/validphys2/src/validphys/filters.py b/validphys2/src/validphys/filters.py index 9c930c8f2..3a9988abf 100644 --- a/validphys2/src/validphys/filters.py +++ b/validphys2/src/validphys/filters.py @@ -93,8 +93,8 @@ def prepare_nnpdf_rng(filterseed:int, rngalgo:int, seed:int): RandomGenerator.InitRNG(rngalgo, seed) RandomGenerator.GetRNG().SetSeed(filterseed) -@check_positive('errorsize') -def filter_closure_data(filter_path, data, t0pdfset, fakenoise, errorsize, prepare_nnpdf_rng): +# @check_positive('errorsize') +def filter_closure_data(filter_path, data, t0pdfset, fakenoise, filterseed, data_index, prepare_nnpdf_rng): """Filter closure data. In addition to cutting data points, the data is generated from an underlying ``t0pdfset``, applying a shift to the data if ``fakenoise`` is ``True``, which emulates the experimental central values @@ -103,12 +103,12 @@ def filter_closure_data(filter_path, data, t0pdfset, fakenoise, errorsize, prepa """ log.info('Filtering closure-test data.') return _filter_closure_data( - filter_path, data, t0pdfset, fakenoise, errorsize) + filter_path, data, t0pdfset, fakenoise, filterseed, data_index) -@check_positive("errorsize") +# @check_positive("errorsize") def filter_closure_data_by_experiment( - filter_path, experiments_data, t0pdfset, fakenoise, errorsize, prepare_nnpdf_rng, + filter_path, experiments_data, t0pdfset, fakenoise, filterseed, data_index, prepare_nnpdf_rng, ): """ Like :py:func:`filter_closure_data` except filters data by experiment. @@ -119,10 +119,21 @@ def filter_closure_data_by_experiment( not reproducible. 
""" - return [ - _filter_closure_data(filter_path, exp, t0pdfset, fakenoise, errorsize) - for exp in experiments_data - ] + + res = [] + for exp in experiments_data: + experiment_index = data_index[data_index.isin([exp.name], level=0)] + res.append( + _filter_closure_data( + filter_path, + exp, + t0pdfset, + fakenoise, + filterseed, + experiment_index, + ) + ) + return res def filter_real_data(filter_path, data): @@ -164,28 +175,39 @@ def _filter_real_data(filter_path, data): nfull, ncut = _write_ds_cut_data(path, dataset) total_data_points += nfull total_cut_data_points += ncut - dataset.load().Export(str(path)) + dataset.commondata.load_commondata(cuts=dataset.cuts).export(path) return total_data_points, total_cut_data_points -def _filter_closure_data(filter_path, data, fakepdfset, fakenoise, errorsize): +def _filter_closure_data(filter_path, data, fakepdfset, fakenoise, filterseed, data_index): """Filter closure test data.""" total_data_points = 0 total_cut_data_points = 0 - fakeset = fakepdfset.legacy_load() - # Load data, don't cache result - loaded_data = data.load.__wrapped__(data) - # generate level 1 shift if fakenoise - loaded_data.MakeClosure(fakeset, fakenoise) - for j, dataset in enumerate(data.datasets): + + # circular import in validphys.core + from validphys.pseudodata import level0_commondata_wc, make_level1_data + + closure_data = level0_commondata_wc(data, fakepdfset) + all_raw_commondata = {} + + for dataset in data.datasets: path = filter_path / dataset.name nfull, ncut = _write_ds_cut_data(path, dataset) total_data_points += nfull total_cut_data_points += ncut - loaded_ds = loaded_data.GetSet(j) - if errorsize != 1.0: - loaded_ds.RescaleErrors(errorsize) - loaded_ds.Export(str(path)) + all_raw_commondata[dataset.name] = dataset.commondata.load_commondata() + + + if fakenoise: + closure_data = make_level1_data(closure_data, filterseed, data_index) + log.info("Writing Level1 data") + else: + log.info("Writing Level0 data") + + for cd in closure_data: + path = filter_path / cd.setname + cd.export(path) + return total_data_points, total_cut_data_points diff --git a/validphys2/src/validphys/fkparser.py b/validphys2/src/validphys/fkparser.py index e2e4077db..da4779235 100644 --- a/validphys2/src/validphys/fkparser.py +++ b/validphys2/src/validphys/fkparser.py @@ -29,6 +29,7 @@ import pandas as pd from validphys.coredata import FKTableData, CFactorData +from validphys.pineparser import pineappl_reader @@ -53,8 +54,13 @@ class GridInfo: def load_fktable(spec): """Load the data corresponding to a FKSpec object. 
The cfactors will be applied to the grid.""" - with open_fkpath(spec.fkpath) as handle: - tabledata = parse_fktable(handle) + if spec.legacy: + with open_fkpath(spec.fkpath) as handle: + tabledata = parse_fktable(handle) + + else: + tabledata = pineappl_reader(spec) + if not spec.cfactors: return tabledata diff --git a/validphys2/src/validphys/loader.py b/validphys2/src/validphys/loader.py index 1c911b3d8..e2360826a 100644 --- a/validphys2/src/validphys/loader.py +++ b/validphys2/src/validphys/loader.py @@ -32,6 +32,7 @@ InternalCutsWrapper, HyperscanSpec) from validphys.utils import tempfile_cleaner from validphys import lhaindex +from validphys.pineparser import parse_theory_meta, TheoryMeta DEFAULT_NNPDF_PROFILE_PATH = f"{sys.prefix}/share/NNPDF/nnprofile.yaml" @@ -351,7 +352,7 @@ def get_commondata(self, setname, sysnum): return cd.load() # @functools.lru_cache() - def check_fktable(self, theoryID, setname, cfac, use_fixed_predictions=False): + def check_fktable(self, theoryID, setname, cfac, use_fixed_predictions=False, new_commondata=False, is_compound=False): _, theopath = self.check_theoryID(theoryID) if use_fixed_predictions: @@ -362,40 +363,90 @@ def check_fktable(self, theoryID, setname, cfac, use_fixed_predictions=False): fixed_predictions_path = theopath/ 'simu_factors' / ('SIMU_%s.yaml' % setname) cfactors = self.check_cfactor(theoryID, setname, cfac) return FKTableSpec(fkpath, cfactors, use_fixed_predictions=True, fixed_predictions_path=fixed_predictions_path) - - fkpath = theopath/ 'fastkernel' / ('FK_%s.dat' % setname) - if not fkpath.exists(): - raise FKTableNotFound(("Could not find FKTable for set '%s'. " - "File '%s' not found") % (setname, fkpath) ) - + cfactors = self.check_cfactor(theoryID, setname, cfac) - return FKTableSpec(fkpath, cfactors) + + # use different file name for the FK table if the commondata is new + if new_commondata: + # Need to pass a TheoryMeta object to FKTableSpec + path_metadata = theopath / 'fastkernel' / f'{setname}_metadata.yaml' + if not path_metadata.exists(): + raise InconsistentMetaDataError(f"Could not find '_metadata.yaml' file for set {setname}." + f"File '{path_metadata}' not found.") + # get observable name from the setname + with open(path_metadata, 'r') as f: + metadata = yaml.safe_load(f) + # NOTE: write a "_metadata.yaml" file for each observable (then `metadata["implemented_observables"][0]` makes sense) + fktables = metadata["implemented_observables"][0]["theory"]["FK_tables"][0] + fkpath = tuple([theopath/ 'fastkernel' / (f'{fktable}.pineappl.lz4') for fktable in fktables]) + for path in fkpath: + if not path.exists(): + raise FKTableNotFound(("Could not find FKTable for set '%s'. " + "File '%s' not found") % (setname, path) ) + + common_prefix = os.path.commonprefix([metadata['setname'], setname]) + + observable_name = setname[len(common_prefix):] + if observable_name.startswith('_'): + observable_name = observable_name[1:] + if is_compound: + theory_meta = TheoryMeta(FK_tables=[fkpath], operation="NULL", conversion_factor=1., shifts=None, normalization=None, comment=None) + else: + theory_meta = parse_theory_meta(path_metadata, observable_name=observable_name) + + return FKTableSpec(fkpath, cfactors, + theory_meta=theory_meta, + legacy=False) + else: + fkpath = theopath/ 'fastkernel' / ('FK_%s.dat' % setname) + + if not fkpath.exists(): + raise FKTableNotFound(("Could not find FKTable for set '%s'. 
" + "File '%s' not found") % (setname, fkpath) ) + + return FKTableSpec(fkpath, cfactors) + - def check_compound(self, theoryID, setname, cfac): + def check_compound(self, theoryID, setname, cfac, new_commondata=False): thid, theopath = self.check_theoryID(theoryID) compound_spec_path = theopath / 'compound' / ('FK_%s-COMPOUND.dat' % setname) - try: - with compound_spec_path.open() as f: - #Drop first line with comment - next(f) - txt = f.read() - except FileNotFoundError as e: - msg = ("Could not find COMPOUND set '%s' for theory %d: %s" % - (setname, int(thid), e)) - raise CompoundNotFound(msg) - #This is a little bit funny, but is the least amount of thinking... - yaml_format = 'FK:\n' + re.sub('FK:', ' - ', txt) - data = yaml.safe_load(yaml_format) - #we have to split out 'FK_' the extension to get a name consistent - #with everything else - try: - tables = [self.check_fktable(theoryID, name[3:-4], cfac) - for name in data['FK']] - except FKTableNotFound as e: - raise LoadFailedError( - f"Incorrect COMPOUND file '{compound_spec_path}'. " - f"Searching for non-existing FKTable:\n{e}") from e - op = data['OP'] + if new_commondata: + # Need to pass a TheoryMeta object to FKTableSpec + path_metadata = theopath / 'fastkernel' / f'{setname}_metadata.yaml' + if not path_metadata.exists(): + raise InconsistentMetaDataError(f"Could not find '_metadata.yaml' file for set {setname}." + f"File '{path_metadata}' not found.") + # get observable name from the setname + with open(path_metadata, 'r') as f: + metadata = yaml.safe_load(f) + op = metadata["implemented_observables"][0]["theory"]["operation"] + if op.upper() == "NULL": + raise CompoundNotFound + names = [tab[0] for tab in metadata["implemented_observables"][0]["theory"]["FK_tables"]] + tables = [self.check_fktable(theoryID, name, cfac, new_commondata=new_commondata, is_compound=True) for name in names] + else: + try: + with compound_spec_path.open() as f: + #Drop first line with comment + next(f) + txt = f.read() + except FileNotFoundError as e: + msg = ("Could not find COMPOUND set '%s' for theory %d: %s" % + (setname, int(thid), e)) + raise CompoundNotFound(msg) + #This is a little bit funny, but is the least amount of thinking... + yaml_format = 'FK:\n' + re.sub('FK:', ' - ', txt) + data = yaml.safe_load(yaml_format) + #we have to split out 'FK_' the extension to get a name consistent + #with everything else + try: + tables = [self.check_fktable(theoryID, name[3:-4], cfac) + for name in data['FK']] + except FKTableNotFound as e: + raise LoadFailedError( + f"Incorrect COMPOUND file '{compound_spec_path}'. 
" + f"Searching for non-existing FKTable:\n{e}") from e + op = data['OP'] return tuple(tables), op @@ -549,20 +600,27 @@ def check_dataset( use_fixed_predictions=False, contamination=None, contamination_data=None, + new_commondata=False, ): if not isinstance(theoryid, TheoryIDSpec): theoryid = self.check_theoryID(theoryid) - theoryno, _ = theoryid + theoryno, theopath = theoryid commondata = self.check_commondata( name, sysnum, use_fitcommondata=use_fitcommondata, fit=fit) try: - fkspec, op = self.check_compound(theoryno, name, cfac) + fkspec, op = self.check_compound(theoryno, name, cfac, new_commondata=new_commondata) except CompoundNotFound: - fkspec = self.check_fktable(theoryno, name, cfac, use_fixed_predictions=use_fixed_predictions) + fkspec = self.check_fktable(theoryno, name, cfac, use_fixed_predictions=use_fixed_predictions, new_commondata=new_commondata) op = None + + if new_commondata: + path_metadata = theopath / 'fastkernel' / f'{name}_metadata.yaml' + with open(path_metadata, "r") as stream: + metadata_card = yaml.safe_load(stream=stream) + op = metadata_card["implemented_observables"][0]["theory"]["operation"] #Note this is simply for convenience when scripting. The config will #construct the actual Cuts object by itself diff --git a/validphys2/src/validphys/n3fit_data.py b/validphys2/src/validphys/n3fit_data.py index 6ac06e872..d7ed1b43c 100644 --- a/validphys2/src/validphys/n3fit_data.py +++ b/validphys2/src/validphys/n3fit_data.py @@ -180,11 +180,20 @@ def _mask_fk_tables(dataset_dicts, tr_masks): vl_fks = [] ex_fks = [] vl_mask = ~tr_mask + for fktable_dict in dataset_dict["fktables"]: - tr_fks.append(fktable_dict["fktable"][tr_mask]) - vl_fks.append(fktable_dict["fktable"][vl_mask]) - ex_fks.append(fktable_dict.get("fktable")) - dataset_dict['ds_tr_mask'] = tr_mask + if not dataset_dict["use_fixed_predictions"]: + tr_fks.append(fktable_dict["fktable"][tr_mask]) + vl_fks.append(fktable_dict["fktable"][vl_mask]) + ex_fks.append(fktable_dict.get("fktable")) + dataset_dict['ds_tr_mask'] = tr_mask + # note: fixed observables have a fake fktable + else: + tr_fks.append(fktable_dict["fktable"]) + vl_fks.append([]) + ex_fks.append(fktable_dict.get("fktable")) + dataset_dict['ds_tr_mask'] = tr_mask + dataset_dict["tr_fktables"] = tr_fks dataset_dict["vl_fktables"] = vl_fks dataset_dict["ex_fktables"] = ex_fks @@ -243,13 +252,13 @@ def fitting_data_dict( # TODO: Plug in the python data loading when available. 
Including but not # limited to: central values, ndata, replica generation, covmat construction if data.datasets: - try: - spec_c = data.load() - except: - breakpoint() - ndata = spec_c.GetNData() - expdata_true = spec_c.get_cv().reshape(1, ndata) - datasets = common_data_reader_experiment(spec_c, data) + ndata = sum([ds.commondata.load_commondata(cuts=ds.cuts).ndata for ds in data.datasets]) + expdata_true = np.array([]) + for ds in data.datasets: + expdata_true = np.append(expdata_true, ds.commondata.load_commondata(cuts=ds.cuts).central_values) + expdata_true = expdata_true.reshape(1, ndata) + # expdata_true = np.array([ds.commondata.load_commondata(cuts=ds.cuts).central_values for ds in data.datasets]).reshape(1,ndata) + datasets = common_data_reader_experiment(data) for i in range(len(data.datasets)): if data.datasets[i].use_fixed_predictions: datasets[i]['use_fixed_predictions'] = True diff --git a/validphys2/src/validphys/n3fit_data_utils.py b/validphys2/src/validphys/n3fit_data_utils.py index 243fdce0e..801e663f7 100644 --- a/validphys2/src/validphys/n3fit_data_utils.py +++ b/validphys2/src/validphys/n3fit_data_utils.py @@ -5,7 +5,7 @@ """ import numpy as np import yaml -from validphys.fkparser import parse_cfactor +from validphys.fkparser import parse_cfactor, load_fktable from validphys.coredata import CFactorData @@ -62,6 +62,50 @@ def fk_parser(fk, is_hadronic=False): # reshape fktable = fktable_array.reshape(shape_out) + dict_out = { + "ndata": ndata, + "nbasis": nbasis, + "nonzero": nbasis, + "basis": basis, + "nx": nx, + "xgrid": xgrid, + "fktable": fktable, + } + + return dict_out + +def new_fk_parser(fkspec, cuts, is_hadronic=False): + """ + # Arguments: + - `fkspec`: fkspec object + + # Return: + - `dict_out`: dictionary with all information about the fktable + - 'xgrid' + - 'nx' + - 'ndata' + - 'basis' + - 'fktable' + """ + # for fixed predictions the same fake fktable is always loaded + if fkspec.use_fixed_predictions: + fktable_data = load_fktable(fkspec) + elif cuts: + fktable_data = load_fktable(fkspec).with_cuts(cuts) + else: + fktable_data = load_fktable(fkspec) + ndata = fktable_data.ndata + xgrid_flat = fktable_data.xgrid + nx = len(xgrid_flat) + + xgrid = xgrid_flat.reshape(1, nx) + + # n of active flavours + basis = fktable_data.luminosity_mapping + nbasis = len(basis) + + fktable = fktable_data.get_np_fktable() + dict_out = { "ndata": ndata, "nbasis": nbasis, @@ -73,6 +117,7 @@ def fk_parser(fk, is_hadronic=False): } return dict_out + def parse_simu_parameters_names_CF(simu_parameters_names_CF, simu_parameters_linear_combinations, cuts): """ Returns a dictionary containing the bsm k-factor corrections @@ -125,7 +170,7 @@ def parse_simu_parameters_names_CF(simu_parameters_names_CF, simu_parameters_lin return name_cfac_map -def common_data_reader_dataset(dataset_c, dataset_spec): +def common_data_reader_dataset(dataset_spec): """ Import fktable, common data and experimental data for the given data_name @@ -149,19 +194,28 @@ def common_data_reader_dataset(dataset_c, dataset_spec): instead of the dictionary object that model_gen needs """ cuts = dataset_spec.cuts - how_many = dataset_c.GetNSigma() dict_fktables = [] - for i in range(how_many): - fktable = dataset_c.GetFK(i) - dict_fktables.append(fk_parser(fktable, dataset_c.IsHadronic())) + + hadronic = load_fktable(dataset_spec.fkspecs[0]).hadronic + # check that all fkspecs have the same hadronicity + for fkspec in dataset_spec.fkspecs: + if hadronic != load_fktable(fkspec).hadronic: + raise ValueError("All 
fkspecs in a dataset must have the same hadronicity") + + for fkspec in dataset_spec.fkspecs: + dict_fktables.append(new_fk_parser(fkspec, cuts, hadronic)) + + # for i in range(how_many): + # fktable = dataset_c.GetFK(i) + # dict_fktables.append(fk_parser(fktable, dataset_c.IsHadronic())) dataset_dict = { "fktables": dict_fktables, - "hadronic": dataset_c.IsHadronic(), + "hadronic": hadronic, "operation": dataset_spec.op, - "name": dataset_c.GetSetName(), + "name": str(dataset_spec), "frac": dataset_spec.frac, - "ndata": dataset_c.GetNData(), + "ndata": dataset_spec.commondata.load_commondata(cuts=dataset_spec.cuts).ndata, "simu_parameters_names_CF": parse_simu_parameters_names_CF(dataset_spec.simu_parameters_names_CF, dataset_spec.simu_parameters_linear_combinations, cuts), "simu_parameters_names": dataset_spec.simu_parameters_names, } @@ -169,21 +223,20 @@ def common_data_reader_dataset(dataset_c, dataset_spec): return [dataset_dict] -def common_data_reader_experiment(experiment_c, experiment_spec): +def common_data_reader_experiment(experiment_spec): """ Wrapper around the experiments. Loop over all datasets in an experiment, calls common_data_reader on them and return a list with the content. # Arguments: - - `experiment_c`: c representation of the experiment object - `experiment_spec`: python representation of the experiment object # Returns: - `[parsed_datasets]`: a list of dictionaries output from `common_data_reader_dataset` """ parsed_datasets = [] - for dataset_c, dataset_spec in zip(experiment_c.DataSets(), experiment_spec.datasets): - parsed_datasets += common_data_reader_dataset(dataset_c, dataset_spec) + for dataset_spec in experiment_spec.datasets: + parsed_datasets += common_data_reader_dataset(dataset_spec) return parsed_datasets @@ -194,7 +247,9 @@ def positivity_reader(pos_spec): pos_c = pos_spec.load() ndata = pos_c.GetNData() - parsed_set = [fk_parser(pos_c, pos_c.IsHadronic())] + # assuming that all positivity sets have only one fktable + # parsed_set = [fk_parser(pos_c, pos_c.IsHadronic())] + parsed_set = [new_fk_parser(pos_spec.fkspecs[0], cuts=False, is_hadronic=pos_c.IsHadronic())] pos_sets = [ { diff --git a/validphys2/src/validphys/pineparser.py b/validphys2/src/validphys/pineparser.py new file mode 100644 index 000000000..f1f4914c8 --- /dev/null +++ b/validphys2/src/validphys/pineparser.py @@ -0,0 +1,293 @@ +""" + Loader for the pineappl-based FKTables + + The FKTables for pineappl have ``pineappl.lz4`` and can be utilized + directly with the ``pineappl`` cli as well as read with ``pineappl.fk_table`` +""" + +import logging + +import numpy as np +import pandas as pd +import dataclasses +import yaml + +from validphys.commondataparser import EXT +from validphys.coredata import FKTableData + +log = logging.getLogger(__name__) + + +@dataclasses.dataclass(frozen=True) +class TheoryMeta: + """Contains the necessary information to load the associated fktables + + The theory metadata must always contain a key ``FK_tables`` which defines + the fktables to be loaded. + The ``FK_tables`` is organized as a double list such that: + + The inner list is concatenated + In practice these are different fktables that might refer to the same observable but + that are divided in subgrids for practical reasons. + The outer list instead are the operands for whatever operation needs to be computed + in order to match the experimental data. 
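    For a RATIO observable, for example, the numerator grids would go in the
    first sub-list and the denominator grids in the last one (the reader later
    identifies the divisor as ``FK_tables[-1][0]``).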
+ + In addition there are other flags that can affect how the fktables are read or used: + - operation: defines the operation to apply to the outer list + - shifts: mapping with the single fktables and their respective shifts + useful to create "gaps" so that the fktables and the respective experimental data + are ordered in the same way (for instance, when some points are missing from a grid) + + This class is inmutable, what is read from the commondata metadata should be considered final + + Example + ------- + >>> from validphys.commondataparser import TheoryMeta + ... from validobj import parse_input + ... from reportengine.compat import yaml + ... theory_raw = ''' + ... FK_tables: + ... - - fk1 + ... - - fk2 + ... - fk3 + ... operation: ratio + ... ''' + ... theory = yaml.safe_load(theory_raw) + ... parse_input(theory, TheoryMeta) + TheoryMeta(FK_tables=[['fk1'], ['fk2', 'fk3']], operation='RATIO', shifts = None, conversion_factor=1.0, comment=None, normalization=None)) + """ + + FK_tables: list[tuple] + operation: str + conversion_factor: float + shifts: None + normalization: None + comment: None + + +def parse_theory_meta(metadata_path, observable_name): + """ + From the metadata.yaml file parses something similar to validphys.commondataparser.TheoryMeta. + This is then needed by the FKspec taken by the pineappl_reader function. + """ + + with open(metadata_path, "r") as f: + meta_card = yaml.safe_load(f) + + # Get the theory metadata + for obs in meta_card['implemented_observables']: + if obs['observable_name'] == observable_name: + obs_metadata = obs + + # TODO: still figure out how to fill shifts and normalization + th_meta = TheoryMeta( + FK_tables= obs_metadata['theory']['FK_tables'], + operation=obs_metadata['theory']['operation'], + conversion_factor=obs_metadata['theory']['conversion_factor'], + shifts=None, + normalization=None, + comment=None + ) + + return th_meta + + +def _pinelumi_to_columns(pine_luminosity, hadronic): + """Makes the pineappl luminosity into the column indices of a dataframe + These corresponds to the indices of a flattened (14x14) matrix for hadronic observables + and the non-zero indices of the 14-flavours for DIS + + Parameters + ---------- + pine_luminosity: list(tuple(int)) + list with a pair of flavours per channel + hadronic: bool + flag for hadronic / DIS observables + + Returns + ------- + list(int): list of labels for the columns + """ + evol_basis_pids = tuple( + [22, 100, 21, 200] + + [200 + n**2 - 1 for n in range(2, 6 + 1)] + + [100 + n**2 - 1 for n in range(2, 6 + 1)] + ) + flav_size = len(evol_basis_pids) + columns = [] + if hadronic: + for i, j in pine_luminosity: + idx = evol_basis_pids.index(i) + jdx = evol_basis_pids.index(j) + columns.append(flav_size * idx + jdx) + else: + # The proton might come from both sides + try: + columns = [evol_basis_pids.index(i) for _, i in pine_luminosity] + except ValueError: + columns = [evol_basis_pids.index(i) for i, _ in pine_luminosity] + return columns + + +def pineappl_reader(fkspec): + """ + Receives a fkspec, which contains the path to the fktables that are to be read by pineappl + as well as metadata that fixes things like conversion factors or apfelcomb flag. + The fkspec contains also the cfactors which are applied _directly_ to each of the fktables. + + The output of this function is an instance of FKTableData which can be generated from reading + several FKTable files which get concatenated on the ndata (bin) axis. 
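    A rough usage sketch (assuming ``fkspec`` was built by the loader for
    new-style commondata):

    >>> fktable = pineappl_reader(fkspec)   # validphys.coredata.FKTableData
    >>> fktable = fktable.with_cuts(cuts)   # optionally apply kinematic cuts
    >>> dense = fktable.get_np_fktable()    # (ndata, nbasis, nx) or (ndata, nbasis, nx, nx)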
+ + For more information on the reading of pineappl tables: + https://pineappl.readthedocs.io/en/latest/modules/pineappl/pineappl.html#pineappl.pineappl.PyFkTable + + About the reader: + Each pineappl table is a 4-dimensional grid with: + (ndata, active channels, x1, x2) + for DIS grids x2 will contain one single number. + The luminosity channels are given in a (flav1, flav2) format and thus need to be converted + to the 1-D index of a (14x14) luminosity tensor in order to put in the form of a dataframe. + + All grids in pineappl are constructed with the exact same xgrid, + the active channels can vary and so when grids are concatenated for an observable + the gaps are filled with 0s. + + The pineappl grids are such that obs = sum_{bins} fk * f (*f) * bin_w + so in order to use them together with old-style grids (obs = sum_{bins} fk * xf (*xf)) + it is necessary to remove the factor of x and the normalization of the bins. + + About apfelcomb flags in yamldb files: + old commondata files and old grids have over time been through various iterations while remaining compatibility between each other, + and fixes and hacks have been incorporated in one or another + for the new theory to be compatible with old commpondata it is necessary + to keep track of said hacks (and to apply conversion factors when required) + NOTE: both conversion factors and apfelcomb flags will be eventually removed. + + Returns + ------- + validphys.coredata.FKTableData + an FKTableData object containing all necessary information to compute predictions + """ + from pineappl.fk_table import FkTable + + pines = [] + for fk_path in fkspec.fkpath: + try: + pines.append(FkTable.read(fk_path)) + except BaseException as e: + # Catch absolutely any error coming from pineappl, give some info and immediately raise + log.error(f"Fatal error reading {fk_path}") + raise e + + cfactors = fkspec.load_cfactors() + + # Extract metadata from the first grid + pine_rep = pines[0] + + # Is it hadronic? (at the moment only hadronic and DIS are considered) + hadronic = pine_rep.key_values()["initial_state_1"] == pine_rep.key_values()["initial_state_2"] + # Sanity check (in case at some point we start fitting things that are not protons) + if hadronic and pine_rep.key_values()["initial_state_1"] != "2212": + raise ValueError( + "pineappl_reader is not prepared to read a hadronic fktable with no protons!" 
+ ) + Q0 = np.sqrt(pine_rep.muf2()) + xgrid = np.array([]) + for pine in pines: + xgrid = np.union1d(xgrid, pine.x_grid()) + xi = np.arange(len(xgrid)) + protected = False + + # Process the shifts and normalizations (if any), + # shifts is a dictionary with {fktable_name: shift_value} + # normalization instead {fktable_name: normalization to apply} + # since this parser doesn't know about operations, we need to convert it to a list + # then we just iterate over the fktables and apply the shift in the right order + shifts = fkspec.theory_meta.shifts + normalization_per_fktable = fkspec.theory_meta.normalization + fknames = [i.name.replace(f".{EXT}", "") for i in fkspec.fkpath] + if cfactors is not None: + cfactors = dict(zip(fknames, cfactors)) + + # fktables in pineapplgrid are for obs = fk * f while previous fktables were obs = fk * xf + # prepare the grid all tables will be divided by + if hadronic: + xdivision = np.prod(np.meshgrid(xgrid, xgrid), axis=0) + else: + xdivision = xgrid[:, np.newaxis] + + partial_fktables = [] + ndata = 0 + for fkname, p in zip(fknames, pines): + # Start by reading possible cfactors if cfactor is not empty + cfprod = 1.0 + if cfactors is not None: + for cfac in cfactors.get(fkname, []): + cfprod *= cfac.central_value + + # Read the table, remove bin normalization and apply cfactors + raw_fktable = (cfprod * p.table().T / p.bin_normalizations()).T + n = raw_fktable.shape[0] + + # Apply possible per-fktable fixes + if shifts is not None: + ndata += shifts.get(fkname, 0) + + if normalization_per_fktable is not None: + raw_fktable = raw_fktable * normalization_per_fktable.get(fkname, 1.0) + + # Add empty points to ensure that all fktables share the same x-grid upon convolution + missing_x_points = np.setdiff1d(xgrid, p.x_grid(), assume_unique=True) + for x_point in missing_x_points: + miss_index = list(xgrid).index(x_point) + raw_fktable = np.insert(raw_fktable, miss_index, 0.0, axis=2) + if hadronic: + raw_fktable = np.insert(raw_fktable, miss_index, 0.0, axis=3) + # Check conversion factors and remove the x* from the fktable + raw_fktable *= fkspec.theory_meta.conversion_factor / xdivision + + # Create the multi-index for the dataframe + # for optimized pineappls different grids can potentially have different indices + # so they need to be indexed separately and then concatenated only at the end + lumi_columns = _pinelumi_to_columns(p.lumi(), hadronic) + lf = len(lumi_columns) + data_idx = np.arange(ndata, ndata + n) + if hadronic: + idx = pd.MultiIndex.from_product([data_idx, xi, xi], names=["data", "x1", "x2"]) + else: + idx = pd.MultiIndex.from_product([data_idx, xi], names=["data", "x"]) + + # Now concatenate (data, x1, x2) and move the flavours to the columns + df_fktable = raw_fktable.swapaxes(0, 1).reshape(lf, -1).T + partial_fktables.append(pd.DataFrame(df_fktable, columns=lumi_columns, index=idx)) + + ndata += n + + # Finallly concatenate all fktables, sort by flavours and fill any holes + sigma = pd.concat(partial_fktables, sort=True, copy=False).fillna(0.0) + + # Check whether this is a 1-point normalization fktable and, if that's the case, protect! 
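    # (Background, as understood from the old libNNPDF behaviour: a "protected"
    #  table is the one-point denominator of a normalised observable, whose value
    #  must not be cut away or repeated. The heuristic below assumes the divisor
    #  grid is the first entry of the last FK_tables sub-list and only marks the
    #  table as protected when this grid is that divisor and reduces to one point.)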
+ if fkspec.theory_meta.operation == "RATIO" and len(pines) == 1: + # it _might_ be, check whether it is the divisor fktable + divisor = fkspec.theory_meta.FK_tables[-1][0] + name = fkspec.fkpath[0].name.replace(f".{EXT}", "") + + if np.allclose(sigma.loc[1:], 0.0): + # Old denominator fktables were filled with 0s beyond the first point + # and they needed to be post-processed to repeat the same point many time + # Instead, drop everything beyond the 1st point (used 0:0 to keep the same kind of df) + sigma = sigma.loc[0:0] + ndata = 1 + + if ndata == 1: + # There's no doubt + protected = divisor == name + + return FKTableData( + sigma=sigma, + ndata=ndata, + Q0=Q0, + metadata=fkspec.theory_meta, + hadronic=hadronic, + xgrid=xgrid, + ) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 4c682e2f7..97121ffe3 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -278,8 +278,6 @@ def level0_commondata_wc( level0_commondata_instances_wc = [] - # import IPython; IPython.embed() - for dataset in data.datasets: commondata_wc = dataset.commondata.load_commondata() @@ -290,6 +288,31 @@ def level0_commondata_wc( # == Generate a new CommonData instance with central value given by Level 0 data generated with fakepdf ==# t0_prediction = dataset_t0_predictions(dataset=dataset, t0set=fakepdf) + # Contamination + if dataset.contamination: + theoryid = dataset.thspec.id + cont_path = l.datapath / f"theory_{theoryid}" / "simu_factors" / f"SIMU_{dataset.name}.yaml" + # load contamination parameters + cont_params = dataset.contamination_data + # load simu_card file + with open(cont_path, "r+") as stream: + simu_card = yaml.safe_load(stream) + stream.close() + # K-factors loading + k_factor = np.zeros(len(t0_prediction)) + for param in cont_params: + # load the k_fac value + value = param["value"] + # load the linear combination coefficients + lin_comb = param["linear_combination"] + # load the BMS cross-section + bsm_xs = np.zeros(len(t0_prediction)) + for op in lin_comb: + bsm_xs += lin_comb[op] * np.array(simu_card[dataset.contamination][op])[cuts] + # compute the K-factor correction + k_factor += value * bsm_xs / np.array(simu_card[dataset.contamination]["SM"])[cuts] + # update t0 prediction to BSM t0 prediction + t0_prediction *= (1. + k_factor) # N.B. cuts already applied to th. pred. level0_commondata_instances_wc.append(commondata_wc.with_central_value(t0_prediction)) diff --git a/validphys2/src/validphys/utils.py b/validphys2/src/validphys/utils.py index 0c2956daa..f7bdb3e55 100644 --- a/validphys2/src/validphys/utils.py +++ b/validphys2/src/validphys/utils.py @@ -56,7 +56,6 @@ def parse_yaml_inp(inp, spec, path): current_exc = current_exc.__cause__ raise ValidationError('\n'.join(error_text_lines)) from e - @contextlib.contextmanager def tempfile_cleaner(root, exit_func, exc, prefix=None, **kwargs): """A context manager to handle temporary directory creation and