From f6ab038306f80c444e38a407b154f1744366fc17 Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Tue, 14 May 2024 12:23:10 +0100 Subject: [PATCH 01/14] added load_commondata to core, level0_commondata_wc and make_level1_data to pseudodata --- validphys2/src/validphys/core.py | 12 ++ validphys2/src/validphys/pseudodata.py | 152 ++++++++++++++++++++++++- 2 files changed, 163 insertions(+), 1 deletion(-) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index 0772d948a..be329c5d9 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -316,6 +316,18 @@ def load(self)->CommonData: #TODO: Use better path handling in python 3.6 return CommonData.ReadFile(str(self.datafile), str(self.sysfile)) + def load_commondata(self, cuts=None): + """ + Loads a coredata.CommonData object from a core.CommonDataSetSpec object + cuts are applied if provided. + """ + # import here to avoid circular imports + from validphys.commondataparser import load_commondata + cd = load_commondata(self) + if cuts is not None: + cd = cd.with_cuts(cuts) + return cd + @property def plot_kinlabels(self): return get_plot_kinlabels(self) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 9fd5863a9..34f2a6c82 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -10,7 +10,9 @@ import numpy as np import pandas as pd -from validphys.covmats import INTRA_DATASET_SYS_NAME +from validphys.covmats import INTRA_DATASET_SYS_NAME, dataset_t0_predictions + +from validphys.core import PDF from reportengine import collect @@ -235,6 +237,154 @@ def indexed_make_replica(groups_index, make_replica): return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) +def level0_commondata_wc(data, fakepdf): + """ + Given a validphys.core.DataGroupSpec object, load commondata and + generate a new commondata instance with central values replaced + by fakepdf prediction + + Parameters + ---------- + + data : validphys.core.DataGroupSpec + + fakepdf: str + + Returns + ------- + list + list of validphys.coredata.CommonData instances corresponding to + all datasets within one experiment. The central value is replaced + by Level 0 fake data. + + Example + ------- + >>> from validphys.api import API + >>> API.level0_commondata_wc(dataset_inputs=[{"dataset":"NMC"}], use_cuts="internal", theoryid=200, fakepdf="NNPDF40_nnlo_as_01180") + + [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] + """ + + level0_commondata_instances_wc = [] + + # argument fakepdf is passed as str + # generate a core.PDF object with name equal to fakepdf argument + fakepdf = PDF(name=fakepdf) + + for dataset in data.datasets: + commondata_wc = dataset.commondata.load_commondata() + if dataset.cuts is not None: + cuts = dataset.cuts.load() + commondata_wc = commondata_wc.with_cuts(cuts) + # == Generate a new CommonData instance with central value given by Level 0 data generated with fakepdf ==# + t0_prediction = dataset_t0_predictions( + dataset=dataset, t0set=fakepdf + ) # N.B. cuts already applied to th. pred. + level0_commondata_instances_wc.append(commondata_wc.with_central_value(t0_prediction)) + + return level0_commondata_instances_wc + + +def make_level1_data( + # data, + level0_commondata_wc, + filterseed, + data_index): + """ + Given a list of Level 0 commondata instances, return the + same list with central values replaced by Level 1 data. 
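# A minimal usage sketch of the load_commondata helper introduced above.  It
# assumes a local validphys installation with theory 200 and the NMC set
# available, mirroring the docstring examples used elsewhere in this series.
from validphys.loader import Loader

l = Loader()
ds = l.check_dataset(name="NMC", theoryid=200)            # DataSetSpec
cuts = ds.cuts.load() if ds.cuts is not None else None    # same guard as in level0_commondata_wc above
cd = ds.commondata.load_commondata(cuts=cuts)             # coredata.CommonData, cut if cuts were given
print(cd)   # e.g. CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', ...)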
+ + Level 1 data is generated using validphys.make_replica. + The covariance matrix, from which the stochastic Level 1 + noise is sampled, is built from Level 0 commondata + instances (level0_commondata_wc). This, in particular, + means that the multiplicative systematics are generated + from the Level 0 central values. + + Note that the covariance matrix used to generate Level 2 + pseudodata is consistent with the one used at Level 1 + up to corrections of the order eta * eps, where eta and + eps are defined as shown below: + + Generate L1 data: L1 = L0 + eta, eta ~ N(0,CL0) + Generate L2 data: L2_k = L1 + eps_k, eps_k ~ N(0,CL1) + + where CL0 and CL1 means that the multiplicative entries + have been constructed from Level 0 and Level 1 central + values respectively. + + + Parameters + ---------- + + REMOVE: data : validphys.core.DataGroupSpec + + level0_commondata_wc : list + list of validphys.coredata.CommonData instances corresponding to + all datasets within one experiment. The central value is replaced + by Level 0 fake data. Cuts already applied. + + filterseed : int + random seed used for the generation of Level 1 data + + data_index : pandas.MultiIndex + + Returns + ------- + list + list of validphys.coredata.CommonData instances corresponding to + all datasets within one experiment. The central value is replaced + by Level 1 fake data. + + Example + ------- + + >>> from validphys.api import API + >>> dataset='NMC' + >>> l1_cd = API.make_level1_data(dataset_inputs = [{"dataset":dataset}],use_cuts="internal", theoryid=200, + fakepdf = "NNPDF40_nnlo_as_01180", filterseed=1, data_index) + >>> l1_cd + [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] + """ + + # from validphys.covmats import dataset_inputs_covmat_from_systematics + + # dataset_input_list = list(data.dsinputs) + + # covmat = dataset_inputs_covmat_from_systematics( + # level0_commondata_wc, + # dataset_input_list, + # use_weights_in_covmat=False, + # norm_threshold=None, + # _list_of_central_values=None, + # ) + + # ================== generation of Level1 data ======================# + level1_data = make_replica( + level0_commondata_wc, + filterseed, + # covmat, + genrep=True + ) + + indexed_level1_data = indexed_make_replica(data_index, level1_data) + + dataset_order = {cd.setname: i for i, cd in enumerate(level0_commondata_wc)} + + # ===== create commondata instances with central values given by pseudo_data =====# + level1_commondata_dict = {c.setname: c for c in level0_commondata_wc} + level1_commondata_instances_wc = [] + + for xx, grp in indexed_level1_data.groupby('dataset'): + level1_commondata_instances_wc.append( + level1_commondata_dict[xx].with_central_value(grp.values) + ) + # sort back so as to mantain same order as in level0_commondata_wc + level1_commondata_instances_wc.sort(key=lambda x: dataset_order[x.setname]) + + return level1_commondata_instances_wc + + _group_recreate_pseudodata = collect('indexed_make_replica', ('group_dataset_inputs_by_experiment',)) _recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment')) _recreate_pdf_pseudodata = collect('_group_recreate_pseudodata', ('pdfreplicas', 'fitenvironment')) From 0b5fd161b75083a1de0403815c29c6b0ff6e6763 Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Thu, 16 May 2024 17:02:40 +0100 Subject: [PATCH 02/14] added parse_fakepdf to config.py --- validphys2/src/validphys/config.py | 5 +++++ validphys2/src/validphys/pseudodata.py | 4 ++-- 2 files changed, 7 insertions(+), 2 
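# A toy numpy illustration of the noise model spelled out in the
# make_level1_data docstring above: L1 = L0 + eta with eta ~ N(0, C_L0), and
# L2_k = L1 + eps_k with eps_k ~ N(0, C_L1).  In validphys the sampling is
# performed by make_replica with the experimental covariance matrix; the
# vectors and covariances below are invented, purely for illustration.
import numpy as np

rng = np.random.default_rng(seed=1)                 # plays the role of filterseed

L0 = np.array([1.0, 2.0, 3.0])                      # Level 0 central values (theory)
C_L0 = np.diag([0.01, 0.04, 0.09])                  # covariance built from the L0 central values

L1 = L0 + rng.multivariate_normal(np.zeros(3), C_L0)          # Level 1 pseudodata

# The multiplicative uncertainties are then rebuilt from L1, so C_L1 equals
# C_L0 only up to the O(eta * eps) corrections mentioned above; this toy
# simply reuses C_L0.
L2 = L1 + rng.multivariate_normal(np.zeros(3), C_L0, size=5)  # 5 Level 2 replicas
print(L1.shape, L2.shape)                           # (3,) (5, 3)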
deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 8375b3f2e..bad0bd964 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -38,6 +38,7 @@ MatchedCuts, SimilarCuts, ThCovMatSpec, + PDF, ) from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas from validphys.loader import ( @@ -171,6 +172,10 @@ def parse_pdf(self, name: str): except NotImplementedError as e: raise ConfigError(str(e)) return pdf + + def parse_fakepdf(self, name: str) -> PDF: + """PDF set used to generate the fake data in a closure test.""" + return self.parse_pdf(name) def parse_load_weights_from_fit(self, name: str): """A fit in the results folder, containing at least a valid filter result.""" diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 34f2a6c82..37917b108 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -248,7 +248,7 @@ def level0_commondata_wc(data, fakepdf): data : validphys.core.DataGroupSpec - fakepdf: str + fakepdf: validphys.core.PDF Returns ------- @@ -269,7 +269,7 @@ def level0_commondata_wc(data, fakepdf): # argument fakepdf is passed as str # generate a core.PDF object with name equal to fakepdf argument - fakepdf = PDF(name=fakepdf) + # fakepdf = PDF(name=fakepdf) for dataset in data.datasets: commondata_wc = dataset.commondata.load_commondata() From eb19f727bb845a76df80e921dd1d44cdbde2503f Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Mon, 20 May 2024 10:14:53 +0100 Subject: [PATCH 03/14] add chi2 provider functions --- validphys2/src/validphys/coredata.py | 4 + validphys2/src/validphys/pseudodata.py | 304 ++++++++++++++++++++++--- validphys2/src/validphys/results.py | 59 ++++- 3 files changed, 328 insertions(+), 39 deletions(-) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 4b8c3cd3d..c7f3418d4 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -263,6 +263,10 @@ def additive_errors(self): add_table.columns = add_systype["name"].to_numpy() return add_table.loc[:, add_table.columns != "SKIP"] + @property + def cuts(self): + indices = self.commondata_table.index + return slice(indices[0]-1, indices[-1]) def systematic_errors(self, central_values=None): """Returns all systematic errors as absolute uncertainties, with a diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 37917b108..1d82e4ef8 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -9,10 +9,13 @@ import numpy as np import pandas as pd +import os +import yaml from validphys.covmats import INTRA_DATASET_SYS_NAME, dataset_t0_predictions -from validphys.core import PDF +from validphys.convolution import central_predictions +from validphys.loader import Loader from reportengine import collect @@ -20,6 +23,8 @@ log = logging.getLogger(__name__) +l = Loader() + DataTrValSpec = namedtuple('DataTrValSpec', ['pseudodata', 'tr_idx', 'val_idx']) context_index = collect("groups_index", ("fitcontext",)) @@ -237,7 +242,10 @@ def indexed_make_replica(groups_index, make_replica): return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) -def level0_commondata_wc(data, fakepdf): +def level0_commondata_wc( + data, + fakepdf + ): """ Given a validphys.core.DataGroupSpec object, load commondata and generate a new commondata instance with central values 
replaced @@ -260,33 +268,35 @@ def level0_commondata_wc(data, fakepdf): Example ------- >>> from validphys.api import API - >>> API.level0_commondata_wc(dataset_inputs=[{"dataset":"NMC"}], use_cuts="internal", theoryid=200, fakepdf="NNPDF40_nnlo_as_01180") + >>> API.level0_commondata_wc(dataset_inputs=[{"dataset":"NMC"}], + use_cuts="internal", + theoryid=200, + fakepdf="NNPDF40_nnlo_as_01180") [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] """ level0_commondata_instances_wc = [] - # argument fakepdf is passed as str - # generate a core.PDF object with name equal to fakepdf argument - # fakepdf = PDF(name=fakepdf) + # import IPython; IPython.embed() for dataset in data.datasets: + commondata_wc = dataset.commondata.load_commondata() if dataset.cuts is not None: cuts = dataset.cuts.load() - commondata_wc = commondata_wc.with_cuts(cuts) + commondata_wc = commondata_wc.with_cuts(cuts=cuts) + # == Generate a new CommonData instance with central value given by Level 0 data generated with fakepdf ==# - t0_prediction = dataset_t0_predictions( - dataset=dataset, t0set=fakepdf - ) # N.B. cuts already applied to th. pred. + t0_prediction = dataset_t0_predictions(dataset=dataset, + t0set=fakepdf) + # N.B. cuts already applied to th. pred. level0_commondata_instances_wc.append(commondata_wc.with_central_value(t0_prediction)) return level0_commondata_instances_wc def make_level1_data( - # data, level0_commondata_wc, filterseed, data_index): @@ -317,8 +327,6 @@ def make_level1_data( Parameters ---------- - REMOVE: data : validphys.core.DataGroupSpec - level0_commondata_wc : list list of validphys.coredata.CommonData instances corresponding to all datasets within one experiment. The central value is replaced @@ -340,32 +348,20 @@ def make_level1_data( ------- >>> from validphys.api import API - >>> dataset='NMC' - >>> l1_cd = API.make_level1_data(dataset_inputs = [{"dataset":dataset}],use_cuts="internal", theoryid=200, - fakepdf = "NNPDF40_nnlo_as_01180", filterseed=1, data_index) - >>> l1_cd + >>> API.make_level1_data(dataset_inputs=[{"dataset": "NMC"}], + use_cuts="internal", + theoryid=200, + fakepdf="NNPDF40_nnlo_as_01180", + filterseed=0, + data_index) [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] """ - # from validphys.covmats import dataset_inputs_covmat_from_systematics - - # dataset_input_list = list(data.dsinputs) - - # covmat = dataset_inputs_covmat_from_systematics( - # level0_commondata_wc, - # dataset_input_list, - # use_weights_in_covmat=False, - # norm_threshold=None, - # _list_of_central_values=None, - # ) - # ================== generation of Level1 data ======================# - level1_data = make_replica( - level0_commondata_wc, - filterseed, - # covmat, - genrep=True - ) + level1_data = make_replica(level0_commondata_wc, + filterseed, + genrep=True, + ) indexed_level1_data = indexed_make_replica(data_index, level1_data) @@ -385,6 +381,244 @@ def make_level1_data( return level1_commondata_instances_wc +def make_level1_list_data( + level0_commondata_wc, + filterseed, + n_samples, + data_index, +): + """ + Given a list of validphys.coredata.CommonData instances with central + values replaced with `fakepdf` predictions with cuts applied + generate a list of level 1 data from such instances + + Parameters + ---------- + + level0_commondata:_wc: list of validphys.coredata.CommonData instances + where the central value is replaced by level 0 + `fakepdf` predictions + + filterseed: int starting seed used to make different 
replicas + + n_samples: int number of replicas + + data_index: pandas.MultiIndex providing information on the experiment, + the dataset, and the cut index + + Returns + ------- + list + list of lists of validphys.coredata.CommonData instances corresponding + to all datasets within one experiment. The central value is replaced + by Level 1 fake data. + + Example + ------- + >>> from validphys.api import API + >>> from validphys.loader import Loader + >>> from validphys.results import data_index + >>> l = Loader() + >>> dataset = l.check_dataset(name="NMC", theoryid=200) + >>> experiment = l.check_experiment(name="data", datasets=[dataset]) + >>> lv0_cd_wc = API.level0_commondata_wc(dataset_inputs=[{"dataset":"NMC"}], + use_cuts="internal", + theoryid=200, + fakepdf="NNPDF40_nnlo_as_01180" + ) + >>> API.make_level1_list_data(level0_commondata_wc=lv0_cd_wc, + filterseed=0, + n_samples=1, + data_index=data_index(experiment) + ) + + [[CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)]] + """ + samples = [make_level1_data(level0_commondata_wc=level0_commondata_wc, + filterseed=filterseed+i, + data_index=data_index) for i in range(n_samples)] + + return samples + + +def sm_predictions( + dataset_inputs, + pdf, + theoryid + ): + + """ + Parameters + ---------- + dataset_inputs: NSList of core.DataSetInput objects + + pdf: core.PDF object + + theoryid: TheoryIDSpec + + Returns + ------- + + dict + dictionary of standard model predictions for the + given dataset_input, pdf, and theory + + """ + + sm_dict = {} + + for dataset in dataset_inputs: + data = l.check_dataset(dataset.name, cfac=dataset.cfac, theoryid=theoryid) + + sm_dict[dataset.name] = central_predictions(data, pdf) + + return sm_dict + + +def load_contamination( + contamination_parameters, + theoryid, + dataset_inputs + ): + + """ + Parameters + ---------- + + contamination_parameters: dict with + + theoryid: TheoryIDSpec + + dataset_inputs: NSList of DataSetInput objects + + Returns + ------- + + dict + dictionary of BSM k-factors to apply on certain datasets + + """ + + cont_path = l.datapath / f"theory_{theoryid.id}" / "simu_factors" + + cont_name = contamination_parameters["name"] + cont_value = contamination_parameters["value"] + cont_lin_comb = contamination_parameters["linear_combination"] + + bsm_dict = {} + + for dataset in dataset_inputs: + + bsmfile = cont_path / f"SIMU_{dataset.name}.yaml" + + cont_order = dataset.contamination + + if cont_order == None: + log.warning( + f"{dataset.name} is not contaminated. Is it right?" + ) + bsm_dict[dataset.name] = np.array([1.]) + elif not os.path.exists(bsmfile): + log.error( + f"Could not find a BSM-factor for {dataset.name}. Are you sure they exist in the given theory?" + ) + bsm_dict[dataset.name] = np.array([1.]) + else: + log.info( + f"Loading {dataset.name}" + ) + with open(bsmfile, "r+") as stream: + simu_card = yaml.safe_load(stream) + stream.close() + + k_factors = np.zeros(len(simu_card["SM_fixed"])) + for op in cont_lin_comb: + k_factors += cont_lin_comb[op] * np.array(simu_card[cont_order][op]) + k_factors = 1. 
+ k_factors * cont_value / np.array(simu_card[cont_order]["SM"]) + + bsm_dict[dataset.name] = k_factors + + return bsm_dict + + +def compute_chi2( + make_level1_list_data, + sm_predictions, + groups_covmat, + load_contamination + ): + + """ + Parameters + ---------- + + make_level1_list_data + + sm_predictions + + groups_covmat + + load_contamination + + Returns + ------- + + dict + dictionary of lists of chi2 per dataset + + """ + + covmat = groups_covmat + samples = make_level1_list_data + bsm_factors = load_contamination + + chi2_dict = {dataset.setname: [] for dataset in samples[0]} + + for sample in samples: + + for dataset in sample: + data_name = dataset.setname + bsm_fac = bsm_factors[data_name] + + if bsm_fac.shape[0] == 1: + data_values = dataset.central_values * bsm_fac + else: + cuts = dataset.cuts + data_values = dataset.central_values * bsm_fac[cuts] + + num_data = dataset.ndata + + covmat_dataset = ( + covmat.xs(data_name, level=1, drop_level=False) + .T.xs(data_name, level=1, drop_level=False) + .values + ) + + theory = sm_predictions[data_name].values.squeeze() + + diff = (data_values - theory).squeeze() + + if diff.size == 1: + chi2 = diff**2 / covmat_dataset[0,0] / num_data + else: + chi2 = (diff.T @ np.linalg.inv(covmat_dataset) @ diff) / num_data + + chi2_dict[data_name].append(chi2) + + return chi2_dict + + +def write_chi2(pdf, compute_chi2, level0_commondata_wc): + + ndata = {dataset.setname: [dataset.ndata] for dataset in level0_commondata_wc} + + df_ndat = pd.DataFrame(ndata) + df_chi2 = pd.DataFrame(compute_chi2) + + chi2 = pd.concat([df_ndat, df_chi2], ignore_index=True) + + chi2.to_csv(f"{pdf}_chi2_dist.csv", index=False) + _group_recreate_pseudodata = collect('indexed_make_replica', ('group_dataset_inputs_by_experiment',)) _recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment')) _recreate_pdf_pseudodata = collect('_group_recreate_pseudodata', ('pdfreplicas', 'fitenvironment')) diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py index 950a11af1..48a9f5222 100644 --- a/validphys2/src/validphys/results.py +++ b/validphys2/src/validphys/results.py @@ -37,6 +37,8 @@ predictions, PredictionsRequireCutsError, ) +from validphys.plotoptions.core import get_info + from validphys.n3fit_data_utils import parse_simu_parameters_names_CF @@ -169,6 +171,55 @@ def from_convolution(cls, pdf, posset): procs_data = collect("data", ("group_dataset_inputs_by_process",)) +def data_index(data): + + """ + Parameters + ---------- + + data: core.DataGroupSpec + + Returns + ------- + + pandas.MultiIndex + + Example + ------- + + >>> from validphys.loader import Loader + >>> from validphys.results import data_index + >>> l = Loader() + >>> dataset = l.check_dataset(name="NMC", + theoryid=200 + ) + >>> experiment = l.check_experiment(name="data", + datasets=[dataset] + ) + >>> data_index(experiment) + + MultiIndex([('NMC', 'NMC', 16), + ('NMC', 'NMC', 21), + ('NMC', 'NMC', 22), + ... 
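# A small numerical check of the figure of merit computed by compute_chi2
# above: chi2/N = d^T C^{-1} d / N, where d is the (k-factor rescaled) Level 1
# data minus the SM prediction and C is the dataset block extracted from
# groups_covmat.  Every number below is invented.
import numpy as np

data   = np.array([1.05, 1.90, 3.20])     # Level 1 central values (toy)
theory = np.array([1.00, 2.00, 3.00])     # sm_predictions values (toy)
kfac   = np.array([1.02, 1.01, 1.00])     # load_contamination k-factors (toy)
C      = np.diag([0.01, 0.04, 0.09])      # dataset block of the covariance (toy)

d = data * kfac - theory
chi2_per_point = d @ np.linalg.solve(C, d) / len(d)   # solve() instead of an explicit inverse
print(round(chi2_per_point, 3))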
+ ('NMC', 'NMC', 289), + ('NMC', 'NMC', 290), + ('NMC', 'NMC', 291)], + names=['experiment', 'dataset', 'id'], length=204) + + """ + + tuples = [] + + for dataset in data.datasets: + exp = get_info(dataset).experiment + for i in dataset.cuts.load(): + tp = (exp, dataset.name, i) + tuples.append(tp) + + return pd.MultiIndex.from_tuples(tuples, names=('experiment', 'dataset', 'id')) + + def groups_index(groups_data): """Return a pandas.MultiIndex with levels for group, dataset and point respectively, the group is determined by a key in the dataset metadata, and @@ -541,7 +592,7 @@ def dataset_inputs_results( # ``results`` to support this. # TODO: The above comment doesn't make sense after adding T0. Deprecate this def pdf_results( - dataset: (DataSetSpec, DataGroupSpec), + dataset: (DataSetSpec, DataGroupSpec), # type: ignore pdfs: Sequence, covariance_matrix, sqrt_covmat, @@ -557,12 +608,12 @@ def pdf_results( @require_one("pdfs", "pdf") @remove_outer("pdfs", "pdf") def one_or_more_results( - dataset: (DataSetSpec, DataGroupSpec), + dataset: (DataSetSpec, DataGroupSpec), # type: ignore covariance_matrix, sqrt_covmat, dataset_bsm_factor, - pdfs: (type(None), Sequence) = None, - pdf: (type(None), PDF) = None, + pdfs: (type(None), Sequence) = None, # type: ignore + pdf: (type(None), PDF) = None, # type: ignore ): """Generate a list of results, where the first element is the data values, and the next is either the prediction for pdf or for each of the pdfs. From 7875a55cc88be35bf46b57645ecdb8abfb7892eb Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Mon, 20 May 2024 10:43:19 +0100 Subject: [PATCH 04/14] added usage write_chi2 --- validphys2/src/validphys/pseudodata.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 1d82e4ef8..fd6fafc7f 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -608,7 +608,26 @@ def compute_chi2( return chi2_dict -def write_chi2(pdf, compute_chi2, level0_commondata_wc): +def write_chi2( + pdf, + compute_chi2, + level0_commondata_wc + ): + + """ + Parameters + ---------- + + pdf: core.PDF + + compute_chi2 + + level0_commondata_wc + + Returns + ------- + + """ ndata = {dataset.setname: [dataset.ndata] for dataset in level0_commondata_wc} @@ -617,7 +636,7 @@ def write_chi2(pdf, compute_chi2, level0_commondata_wc): chi2 = pd.concat([df_ndat, df_chi2], ignore_index=True) - chi2.to_csv(f"{pdf}_chi2_dist.csv", index=False) + chi2.to_csv(f"output/tables/{pdf}_chi2_dist.csv", index=False) _group_recreate_pseudodata = collect('indexed_make_replica', ('group_dataset_inputs_by_experiment',)) _recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment')) From 18e64175880d5ff8c56566676b4254b73d6b9e0f Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Mon, 20 May 2024 10:46:39 +0100 Subject: [PATCH 05/14] fixed repo --- validphys2/src/validphys/pseudodata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index fd6fafc7f..67ed1431b 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -636,7 +636,7 @@ def write_chi2( chi2 = pd.concat([df_ndat, df_chi2], ignore_index=True) - chi2.to_csv(f"output/tables/{pdf}_chi2_dist.csv", index=False) + chi2.to_csv(f"{pdf}_chi2_dist.csv", index=False) _group_recreate_pseudodata = 
collect('indexed_make_replica', ('group_dataset_inputs_by_experiment',)) _recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment')) From 104dd5f4c81c7b38f0fb19cce337412897d4c273 Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Tue, 21 May 2024 17:26:27 +0100 Subject: [PATCH 06/14] added k-factor loading in dataset_bsm_factor --- validphys2/src/validphys/dataplots.py | 8 ++-- validphys2/src/validphys/results.py | 60 +++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 12 deletions(-) diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py index af2e7b657..b307691f1 100644 --- a/validphys2/src/validphys/dataplots.py +++ b/validphys2/src/validphys/dataplots.py @@ -199,7 +199,7 @@ def check_normalize_to(ns, **kwargs): #TODO: This interface is horrible. We need to think how to adapt libnnpdf #to make this use case easier def _plot_fancy_impl(results, commondata, cutlist, - normalize_to:(int,type(None)) = None, labellist=None): + normalize_to:(int,type(None)) = None, labellist=None): # type: ignore """Implementation of the data-theory comparison plots. Providers are supposed to call (yield from) this. @@ -383,7 +383,7 @@ def _plot_fancy_impl(results, commondata, cutlist, @check_normalize_to @figuregen def plot_fancy(one_or_more_results, commondata, cuts, - normalize_to: (int, str, type(None)) = None): + normalize_to: (int, str, type(None)) = None): # type: ignore """ Read the PLOTTING configuration for the dataset and generate the corrspondig data theory plot. @@ -439,7 +439,7 @@ def plot_fancy_dataspecs( dataspecs_commondata, dataspecs_cuts, dataspecs_speclabel, - normalize_to: (str, int, type(None)) = None, + normalize_to: (str, int, type(None)) = None, # type: ignore ): """ General interface for data-theory comparison plots. @@ -1069,7 +1069,7 @@ def plot_xq2( display_cuts:bool=True, marker_by:str='process type', highlight_label:str='highlight', - highlight_datasets:(Sequence,type(None))=None, + highlight_datasets:(Sequence,type(None))=None, # type: ignore aspect:str='landscape', ): """Plot the (x,Q²) coverage based of the data based on some LO diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py index 48a9f5222..fba771e25 100644 --- a/validphys2/src/validphys/results.py +++ b/validphys2/src/validphys/results.py @@ -12,6 +12,8 @@ import numpy as np import pandas as pd import scipy.linalg as la +import os +import yaml from NNPDF import CommonData from reportengine.checks import require_one, remove_outer, check_not_empty @@ -38,13 +40,13 @@ PredictionsRequireCutsError, ) from validphys.plotoptions.core import get_info - +from validphys.loader import Loader from validphys.n3fit_data_utils import parse_simu_parameters_names_CF log = logging.getLogger(__name__) - +l = Loader() class Result: pass @@ -359,16 +361,16 @@ def group_result_table_68cl( return res -def dataset_inputs_bsm_factor(data, pdf, read_bsm_facs): +def dataset_inputs_bsm_factor(data, pdf, read_bsm_facs, contamination_parameters=None, theoryid=None): """Same as :py:func:`validphys.results.dataset_bsm_factor` but for a list of dataset inputs. 
""" res = np.concatenate( - [dataset_bsm_factor(dataset, pdf, read_bsm_facs) for dataset in data.datasets] + [dataset_bsm_factor(dataset, pdf, read_bsm_facs, contamination_parameters, theoryid) for dataset in data.datasets] ) return res -def dataset_bsm_factor(dataset, pdf, read_bsm_facs): +def dataset_bsm_factor(dataset, pdf, read_bsm_facs, contamination_parameters=None, theoryid=None): """For each replica of ``pdf``, scale the fitted BSM-factors by the best fit value. Returns @@ -382,12 +384,54 @@ def dataset_bsm_factor(dataset, pdf, read_bsm_facs): dataset.cuts ) - if parsed_bsm_facs is None: + ndata = len(dataset.load().get_cv()) + nrep = len(pdf) + + + if parsed_bsm_facs is None and dataset.contamination is None: # We want an array of ones that ndata x nrep # where ndata is the number of post cut datapoints - ndata = len(dataset.load().get_cv()) - nrep = len(pdf) return np.ones((ndata, nrep)) + + + # after running a contamination fit we do not get fitted bsm factors in a csv + # hence, to produce a comparison between the data used for the fitting and the + # theory prediction of such fit, we must load k-factors in another way + # we write the contamination name, value (e.g. zhat), and linear combination + # in the yaml file, and if the `contamination` keyword is present in the dataset_input + # we access the `SIMU_` file of the desired dataset, and load the k-factors + if dataset.contamination: + + cont_order = dataset.contamination + bsm_path = l.datapath / f"theory_{theoryid.id}" / "simu_factors" + + if contamination_parameters is None: + log.warning("No Contamination parameters provided") + return np.ones((ndata, nrep)) + + else: + bsm_file = bsm_path / f"SIMU_{dataset.name}.yaml" + if not os.path.exists(bsm_file): + log.error(f"Could not find a BSM-factor for {dataset.name}. Are you sure they exist in the given theory?") + return np.ones((ndata, nrep)) + + log.info(f"Loading {dataset.name}") + + cont_name = contamination_parameters["name"] + cont_value = contamination_parameters["value"] + cont_lin_comb = contamination_parameters["linear_combination"] + + with open(bsm_file, "r+") as stream: + simu_card = yaml.safe_load(stream) + stream.close() + + k_factors = np.zeros(len(simu_card["SM_fixed"])) + for op in cont_lin_comb: + k_factors += cont_lin_comb[op] * np.array(simu_card[cont_order][op]) + k_factors = 1. + k_factors * cont_value / np.array(simu_card[cont_order]["SM"]) + + cuts = dataset.cuts.load() + return np.array([k_factors[cuts]] * nrep).T fit_bsm_fac_df = pd.DataFrame( {k: v.central_value for k, v in parsed_bsm_facs.items()} From 2d3e7de23d84ebe369e17ba0b58eff32e6ad29ac Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Tue, 21 May 2024 17:52:39 +0100 Subject: [PATCH 07/14] Revert "added k-factor loading in dataset_bsm_factor" This reverts commit 104dd5f4c81c7b38f0fb19cce337412897d4c273. --- validphys2/src/validphys/dataplots.py | 8 ++-- validphys2/src/validphys/results.py | 60 ++++----------------------- 2 files changed, 12 insertions(+), 56 deletions(-) diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py index b307691f1..af2e7b657 100644 --- a/validphys2/src/validphys/dataplots.py +++ b/validphys2/src/validphys/dataplots.py @@ -199,7 +199,7 @@ def check_normalize_to(ns, **kwargs): #TODO: This interface is horrible. 
We need to think how to adapt libnnpdf #to make this use case easier def _plot_fancy_impl(results, commondata, cutlist, - normalize_to:(int,type(None)) = None, labellist=None): # type: ignore + normalize_to:(int,type(None)) = None, labellist=None): """Implementation of the data-theory comparison plots. Providers are supposed to call (yield from) this. @@ -383,7 +383,7 @@ def _plot_fancy_impl(results, commondata, cutlist, @check_normalize_to @figuregen def plot_fancy(one_or_more_results, commondata, cuts, - normalize_to: (int, str, type(None)) = None): # type: ignore + normalize_to: (int, str, type(None)) = None): """ Read the PLOTTING configuration for the dataset and generate the corrspondig data theory plot. @@ -439,7 +439,7 @@ def plot_fancy_dataspecs( dataspecs_commondata, dataspecs_cuts, dataspecs_speclabel, - normalize_to: (str, int, type(None)) = None, # type: ignore + normalize_to: (str, int, type(None)) = None, ): """ General interface for data-theory comparison plots. @@ -1069,7 +1069,7 @@ def plot_xq2( display_cuts:bool=True, marker_by:str='process type', highlight_label:str='highlight', - highlight_datasets:(Sequence,type(None))=None, # type: ignore + highlight_datasets:(Sequence,type(None))=None, aspect:str='landscape', ): """Plot the (x,Q²) coverage based of the data based on some LO diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py index fba771e25..48a9f5222 100644 --- a/validphys2/src/validphys/results.py +++ b/validphys2/src/validphys/results.py @@ -12,8 +12,6 @@ import numpy as np import pandas as pd import scipy.linalg as la -import os -import yaml from NNPDF import CommonData from reportengine.checks import require_one, remove_outer, check_not_empty @@ -40,13 +38,13 @@ PredictionsRequireCutsError, ) from validphys.plotoptions.core import get_info -from validphys.loader import Loader + from validphys.n3fit_data_utils import parse_simu_parameters_names_CF log = logging.getLogger(__name__) -l = Loader() + class Result: pass @@ -361,16 +359,16 @@ def group_result_table_68cl( return res -def dataset_inputs_bsm_factor(data, pdf, read_bsm_facs, contamination_parameters=None, theoryid=None): +def dataset_inputs_bsm_factor(data, pdf, read_bsm_facs): """Same as :py:func:`validphys.results.dataset_bsm_factor` but for a list of dataset inputs. """ res = np.concatenate( - [dataset_bsm_factor(dataset, pdf, read_bsm_facs, contamination_parameters, theoryid) for dataset in data.datasets] + [dataset_bsm_factor(dataset, pdf, read_bsm_facs) for dataset in data.datasets] ) return res -def dataset_bsm_factor(dataset, pdf, read_bsm_facs, contamination_parameters=None, theoryid=None): +def dataset_bsm_factor(dataset, pdf, read_bsm_facs): """For each replica of ``pdf``, scale the fitted BSM-factors by the best fit value. Returns @@ -384,54 +382,12 @@ def dataset_bsm_factor(dataset, pdf, read_bsm_facs, contamination_parameters=Non dataset.cuts ) - ndata = len(dataset.load().get_cv()) - nrep = len(pdf) - - - if parsed_bsm_facs is None and dataset.contamination is None: + if parsed_bsm_facs is None: # We want an array of ones that ndata x nrep # where ndata is the number of post cut datapoints + ndata = len(dataset.load().get_cv()) + nrep = len(pdf) return np.ones((ndata, nrep)) - - - # after running a contamination fit we do not get fitted bsm factors in a csv - # hence, to produce a comparison between the data used for the fitting and the - # theory prediction of such fit, we must load k-factors in another way - # we write the contamination name, value (e.g. 
zhat), and linear combination - # in the yaml file, and if the `contamination` keyword is present in the dataset_input - # we access the `SIMU_` file of the desired dataset, and load the k-factors - if dataset.contamination: - - cont_order = dataset.contamination - bsm_path = l.datapath / f"theory_{theoryid.id}" / "simu_factors" - - if contamination_parameters is None: - log.warning("No Contamination parameters provided") - return np.ones((ndata, nrep)) - - else: - bsm_file = bsm_path / f"SIMU_{dataset.name}.yaml" - if not os.path.exists(bsm_file): - log.error(f"Could not find a BSM-factor for {dataset.name}. Are you sure they exist in the given theory?") - return np.ones((ndata, nrep)) - - log.info(f"Loading {dataset.name}") - - cont_name = contamination_parameters["name"] - cont_value = contamination_parameters["value"] - cont_lin_comb = contamination_parameters["linear_combination"] - - with open(bsm_file, "r+") as stream: - simu_card = yaml.safe_load(stream) - stream.close() - - k_factors = np.zeros(len(simu_card["SM_fixed"])) - for op in cont_lin_comb: - k_factors += cont_lin_comb[op] * np.array(simu_card[cont_order][op]) - k_factors = 1. + k_factors * cont_value / np.array(simu_card[cont_order]["SM"]) - - cuts = dataset.cuts.load() - return np.array([k_factors[cuts]] * nrep).T fit_bsm_fac_df = pd.DataFrame( {k: v.central_value for k, v in parsed_bsm_facs.items()} From 08d67b6ee49567ccc8f8a3baa3ea0d33c469c0ff Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Tue, 21 May 2024 17:58:25 +0100 Subject: [PATCH 08/14] Revert "fixed repo" This reverts commit 18e64175880d5ff8c56566676b4254b73d6b9e0f. --- validphys2/src/validphys/pseudodata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 67ed1431b..fd6fafc7f 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -636,7 +636,7 @@ def write_chi2( chi2 = pd.concat([df_ndat, df_chi2], ignore_index=True) - chi2.to_csv(f"{pdf}_chi2_dist.csv", index=False) + chi2.to_csv(f"output/tables/{pdf}_chi2_dist.csv", index=False) _group_recreate_pseudodata = collect('indexed_make_replica', ('group_dataset_inputs_by_experiment',)) _recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment')) From 572ff73ef9743dafa36f4746e18be484f5f4621b Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Tue, 21 May 2024 17:58:43 +0100 Subject: [PATCH 09/14] Revert "added usage write_chi2" This reverts commit 7875a55cc88be35bf46b57645ecdb8abfb7892eb. 
--- validphys2/src/validphys/pseudodata.py | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index fd6fafc7f..1d82e4ef8 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -608,26 +608,7 @@ def compute_chi2( return chi2_dict -def write_chi2( - pdf, - compute_chi2, - level0_commondata_wc - ): - - """ - Parameters - ---------- - - pdf: core.PDF - - compute_chi2 - - level0_commondata_wc - - Returns - ------- - - """ +def write_chi2(pdf, compute_chi2, level0_commondata_wc): ndata = {dataset.setname: [dataset.ndata] for dataset in level0_commondata_wc} @@ -636,7 +617,7 @@ def write_chi2( chi2 = pd.concat([df_ndat, df_chi2], ignore_index=True) - chi2.to_csv(f"output/tables/{pdf}_chi2_dist.csv", index=False) + chi2.to_csv(f"{pdf}_chi2_dist.csv", index=False) _group_recreate_pseudodata = collect('indexed_make_replica', ('group_dataset_inputs_by_experiment',)) _recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment')) From 3e5ed9ba788786bb7cb64bf2af99dd3ea496c777 Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Tue, 21 May 2024 17:59:48 +0100 Subject: [PATCH 10/14] Revert "add chi2 provider functions" This reverts commit eb19f727bb845a76df80e921dd1d44cdbde2503f. --- validphys2/src/validphys/coredata.py | 4 - validphys2/src/validphys/pseudodata.py | 304 +++---------------------- validphys2/src/validphys/results.py | 59 +---- 3 files changed, 39 insertions(+), 328 deletions(-) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index c7f3418d4..4b8c3cd3d 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -263,10 +263,6 @@ def additive_errors(self): add_table.columns = add_systype["name"].to_numpy() return add_table.loc[:, add_table.columns != "SKIP"] - @property - def cuts(self): - indices = self.commondata_table.index - return slice(indices[0]-1, indices[-1]) def systematic_errors(self, central_values=None): """Returns all systematic errors as absolute uncertainties, with a diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 1d82e4ef8..37917b108 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -9,13 +9,10 @@ import numpy as np import pandas as pd -import os -import yaml from validphys.covmats import INTRA_DATASET_SYS_NAME, dataset_t0_predictions -from validphys.convolution import central_predictions -from validphys.loader import Loader +from validphys.core import PDF from reportengine import collect @@ -23,8 +20,6 @@ log = logging.getLogger(__name__) -l = Loader() - DataTrValSpec = namedtuple('DataTrValSpec', ['pseudodata', 'tr_idx', 'val_idx']) context_index = collect("groups_index", ("fitcontext",)) @@ -242,10 +237,7 @@ def indexed_make_replica(groups_index, make_replica): return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) -def level0_commondata_wc( - data, - fakepdf - ): +def level0_commondata_wc(data, fakepdf): """ Given a validphys.core.DataGroupSpec object, load commondata and generate a new commondata instance with central values replaced @@ -268,35 +260,33 @@ def level0_commondata_wc( Example ------- >>> from validphys.api import API - >>> API.level0_commondata_wc(dataset_inputs=[{"dataset":"NMC"}], - use_cuts="internal", - theoryid=200, - fakepdf="NNPDF40_nnlo_as_01180") + >>> 
API.level0_commondata_wc(dataset_inputs=[{"dataset":"NMC"}], use_cuts="internal", theoryid=200, fakepdf="NNPDF40_nnlo_as_01180") [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] """ level0_commondata_instances_wc = [] - # import IPython; IPython.embed() + # argument fakepdf is passed as str + # generate a core.PDF object with name equal to fakepdf argument + # fakepdf = PDF(name=fakepdf) for dataset in data.datasets: - commondata_wc = dataset.commondata.load_commondata() if dataset.cuts is not None: cuts = dataset.cuts.load() - commondata_wc = commondata_wc.with_cuts(cuts=cuts) - + commondata_wc = commondata_wc.with_cuts(cuts) # == Generate a new CommonData instance with central value given by Level 0 data generated with fakepdf ==# - t0_prediction = dataset_t0_predictions(dataset=dataset, - t0set=fakepdf) - # N.B. cuts already applied to th. pred. + t0_prediction = dataset_t0_predictions( + dataset=dataset, t0set=fakepdf + ) # N.B. cuts already applied to th. pred. level0_commondata_instances_wc.append(commondata_wc.with_central_value(t0_prediction)) return level0_commondata_instances_wc def make_level1_data( + # data, level0_commondata_wc, filterseed, data_index): @@ -327,6 +317,8 @@ def make_level1_data( Parameters ---------- + REMOVE: data : validphys.core.DataGroupSpec + level0_commondata_wc : list list of validphys.coredata.CommonData instances corresponding to all datasets within one experiment. The central value is replaced @@ -348,20 +340,32 @@ def make_level1_data( ------- >>> from validphys.api import API - >>> API.make_level1_data(dataset_inputs=[{"dataset": "NMC"}], - use_cuts="internal", - theoryid=200, - fakepdf="NNPDF40_nnlo_as_01180", - filterseed=0, - data_index) + >>> dataset='NMC' + >>> l1_cd = API.make_level1_data(dataset_inputs = [{"dataset":dataset}],use_cuts="internal", theoryid=200, + fakepdf = "NNPDF40_nnlo_as_01180", filterseed=1, data_index) + >>> l1_cd [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] """ + # from validphys.covmats import dataset_inputs_covmat_from_systematics + + # dataset_input_list = list(data.dsinputs) + + # covmat = dataset_inputs_covmat_from_systematics( + # level0_commondata_wc, + # dataset_input_list, + # use_weights_in_covmat=False, + # norm_threshold=None, + # _list_of_central_values=None, + # ) + # ================== generation of Level1 data ======================# - level1_data = make_replica(level0_commondata_wc, - filterseed, - genrep=True, - ) + level1_data = make_replica( + level0_commondata_wc, + filterseed, + # covmat, + genrep=True + ) indexed_level1_data = indexed_make_replica(data_index, level1_data) @@ -381,244 +385,6 @@ def make_level1_data( return level1_commondata_instances_wc -def make_level1_list_data( - level0_commondata_wc, - filterseed, - n_samples, - data_index, -): - """ - Given a list of validphys.coredata.CommonData instances with central - values replaced with `fakepdf` predictions with cuts applied - generate a list of level 1 data from such instances - - Parameters - ---------- - - level0_commondata:_wc: list of validphys.coredata.CommonData instances - where the central value is replaced by level 0 - `fakepdf` predictions - - filterseed: int starting seed used to make different replicas - - n_samples: int number of replicas - - data_index: pandas.MultiIndex providing information on the experiment, - the dataset, and the cut index - - Returns - ------- - list - list of lists of validphys.coredata.CommonData instances corresponding - to all 
datasets within one experiment. The central value is replaced - by Level 1 fake data. - - Example - ------- - >>> from validphys.api import API - >>> from validphys.loader import Loader - >>> from validphys.results import data_index - >>> l = Loader() - >>> dataset = l.check_dataset(name="NMC", theoryid=200) - >>> experiment = l.check_experiment(name="data", datasets=[dataset]) - >>> lv0_cd_wc = API.level0_commondata_wc(dataset_inputs=[{"dataset":"NMC"}], - use_cuts="internal", - theoryid=200, - fakepdf="NNPDF40_nnlo_as_01180" - ) - >>> API.make_level1_list_data(level0_commondata_wc=lv0_cd_wc, - filterseed=0, - n_samples=1, - data_index=data_index(experiment) - ) - - [[CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)]] - """ - samples = [make_level1_data(level0_commondata_wc=level0_commondata_wc, - filterseed=filterseed+i, - data_index=data_index) for i in range(n_samples)] - - return samples - - -def sm_predictions( - dataset_inputs, - pdf, - theoryid - ): - - """ - Parameters - ---------- - dataset_inputs: NSList of core.DataSetInput objects - - pdf: core.PDF object - - theoryid: TheoryIDSpec - - Returns - ------- - - dict - dictionary of standard model predictions for the - given dataset_input, pdf, and theory - - """ - - sm_dict = {} - - for dataset in dataset_inputs: - data = l.check_dataset(dataset.name, cfac=dataset.cfac, theoryid=theoryid) - - sm_dict[dataset.name] = central_predictions(data, pdf) - - return sm_dict - - -def load_contamination( - contamination_parameters, - theoryid, - dataset_inputs - ): - - """ - Parameters - ---------- - - contamination_parameters: dict with - - theoryid: TheoryIDSpec - - dataset_inputs: NSList of DataSetInput objects - - Returns - ------- - - dict - dictionary of BSM k-factors to apply on certain datasets - - """ - - cont_path = l.datapath / f"theory_{theoryid.id}" / "simu_factors" - - cont_name = contamination_parameters["name"] - cont_value = contamination_parameters["value"] - cont_lin_comb = contamination_parameters["linear_combination"] - - bsm_dict = {} - - for dataset in dataset_inputs: - - bsmfile = cont_path / f"SIMU_{dataset.name}.yaml" - - cont_order = dataset.contamination - - if cont_order == None: - log.warning( - f"{dataset.name} is not contaminated. Is it right?" - ) - bsm_dict[dataset.name] = np.array([1.]) - elif not os.path.exists(bsmfile): - log.error( - f"Could not find a BSM-factor for {dataset.name}. Are you sure they exist in the given theory?" - ) - bsm_dict[dataset.name] = np.array([1.]) - else: - log.info( - f"Loading {dataset.name}" - ) - with open(bsmfile, "r+") as stream: - simu_card = yaml.safe_load(stream) - stream.close() - - k_factors = np.zeros(len(simu_card["SM_fixed"])) - for op in cont_lin_comb: - k_factors += cont_lin_comb[op] * np.array(simu_card[cont_order][op]) - k_factors = 1. 
+ k_factors * cont_value / np.array(simu_card[cont_order]["SM"]) - - bsm_dict[dataset.name] = k_factors - - return bsm_dict - - -def compute_chi2( - make_level1_list_data, - sm_predictions, - groups_covmat, - load_contamination - ): - - """ - Parameters - ---------- - - make_level1_list_data - - sm_predictions - - groups_covmat - - load_contamination - - Returns - ------- - - dict - dictionary of lists of chi2 per dataset - - """ - - covmat = groups_covmat - samples = make_level1_list_data - bsm_factors = load_contamination - - chi2_dict = {dataset.setname: [] for dataset in samples[0]} - - for sample in samples: - - for dataset in sample: - data_name = dataset.setname - bsm_fac = bsm_factors[data_name] - - if bsm_fac.shape[0] == 1: - data_values = dataset.central_values * bsm_fac - else: - cuts = dataset.cuts - data_values = dataset.central_values * bsm_fac[cuts] - - num_data = dataset.ndata - - covmat_dataset = ( - covmat.xs(data_name, level=1, drop_level=False) - .T.xs(data_name, level=1, drop_level=False) - .values - ) - - theory = sm_predictions[data_name].values.squeeze() - - diff = (data_values - theory).squeeze() - - if diff.size == 1: - chi2 = diff**2 / covmat_dataset[0,0] / num_data - else: - chi2 = (diff.T @ np.linalg.inv(covmat_dataset) @ diff) / num_data - - chi2_dict[data_name].append(chi2) - - return chi2_dict - - -def write_chi2(pdf, compute_chi2, level0_commondata_wc): - - ndata = {dataset.setname: [dataset.ndata] for dataset in level0_commondata_wc} - - df_ndat = pd.DataFrame(ndata) - df_chi2 = pd.DataFrame(compute_chi2) - - chi2 = pd.concat([df_ndat, df_chi2], ignore_index=True) - - chi2.to_csv(f"{pdf}_chi2_dist.csv", index=False) - _group_recreate_pseudodata = collect('indexed_make_replica', ('group_dataset_inputs_by_experiment',)) _recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment')) _recreate_pdf_pseudodata = collect('_group_recreate_pseudodata', ('pdfreplicas', 'fitenvironment')) diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py index 48a9f5222..950a11af1 100644 --- a/validphys2/src/validphys/results.py +++ b/validphys2/src/validphys/results.py @@ -37,8 +37,6 @@ predictions, PredictionsRequireCutsError, ) -from validphys.plotoptions.core import get_info - from validphys.n3fit_data_utils import parse_simu_parameters_names_CF @@ -171,55 +169,6 @@ def from_convolution(cls, pdf, posset): procs_data = collect("data", ("group_dataset_inputs_by_process",)) -def data_index(data): - - """ - Parameters - ---------- - - data: core.DataGroupSpec - - Returns - ------- - - pandas.MultiIndex - - Example - ------- - - >>> from validphys.loader import Loader - >>> from validphys.results import data_index - >>> l = Loader() - >>> dataset = l.check_dataset(name="NMC", - theoryid=200 - ) - >>> experiment = l.check_experiment(name="data", - datasets=[dataset] - ) - >>> data_index(experiment) - - MultiIndex([('NMC', 'NMC', 16), - ('NMC', 'NMC', 21), - ('NMC', 'NMC', 22), - ... 
- ('NMC', 'NMC', 289), - ('NMC', 'NMC', 290), - ('NMC', 'NMC', 291)], - names=['experiment', 'dataset', 'id'], length=204) - - """ - - tuples = [] - - for dataset in data.datasets: - exp = get_info(dataset).experiment - for i in dataset.cuts.load(): - tp = (exp, dataset.name, i) - tuples.append(tp) - - return pd.MultiIndex.from_tuples(tuples, names=('experiment', 'dataset', 'id')) - - def groups_index(groups_data): """Return a pandas.MultiIndex with levels for group, dataset and point respectively, the group is determined by a key in the dataset metadata, and @@ -592,7 +541,7 @@ def dataset_inputs_results( # ``results`` to support this. # TODO: The above comment doesn't make sense after adding T0. Deprecate this def pdf_results( - dataset: (DataSetSpec, DataGroupSpec), # type: ignore + dataset: (DataSetSpec, DataGroupSpec), pdfs: Sequence, covariance_matrix, sqrt_covmat, @@ -608,12 +557,12 @@ def pdf_results( @require_one("pdfs", "pdf") @remove_outer("pdfs", "pdf") def one_or_more_results( - dataset: (DataSetSpec, DataGroupSpec), # type: ignore + dataset: (DataSetSpec, DataGroupSpec), covariance_matrix, sqrt_covmat, dataset_bsm_factor, - pdfs: (type(None), Sequence) = None, # type: ignore - pdf: (type(None), PDF) = None, # type: ignore + pdfs: (type(None), Sequence) = None, + pdf: (type(None), PDF) = None, ): """Generate a list of results, where the first element is the data values, and the next is either the prediction for pdf or for each of the pdfs. From c9a43b4ddbae8af51162f893550382078eaaab9b Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Tue, 21 May 2024 17:59:58 +0100 Subject: [PATCH 11/14] Revert "added parse_fakepdf to config.py" This reverts commit 0b5fd161b75083a1de0403815c29c6b0ff6e6763. --- validphys2/src/validphys/config.py | 5 ----- validphys2/src/validphys/pseudodata.py | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index bad0bd964..8375b3f2e 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -38,7 +38,6 @@ MatchedCuts, SimilarCuts, ThCovMatSpec, - PDF, ) from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas from validphys.loader import ( @@ -172,10 +171,6 @@ def parse_pdf(self, name: str): except NotImplementedError as e: raise ConfigError(str(e)) return pdf - - def parse_fakepdf(self, name: str) -> PDF: - """PDF set used to generate the fake data in a closure test.""" - return self.parse_pdf(name) def parse_load_weights_from_fit(self, name: str): """A fit in the results folder, containing at least a valid filter result.""" diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 37917b108..34f2a6c82 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -248,7 +248,7 @@ def level0_commondata_wc(data, fakepdf): data : validphys.core.DataGroupSpec - fakepdf: validphys.core.PDF + fakepdf: str Returns ------- @@ -269,7 +269,7 @@ def level0_commondata_wc(data, fakepdf): # argument fakepdf is passed as str # generate a core.PDF object with name equal to fakepdf argument - # fakepdf = PDF(name=fakepdf) + fakepdf = PDF(name=fakepdf) for dataset in data.datasets: commondata_wc = dataset.commondata.load_commondata() From c2d1900879961e1256896ee57bfb716f5226c787 Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Tue, 21 May 2024 18:00:06 +0100 Subject: [PATCH 12/14] Revert "added load_commondata to core, level0_commondata_wc and 
make_level1_data to pseudodata" This reverts commit f6ab038306f80c444e38a407b154f1744366fc17. --- validphys2/src/validphys/core.py | 12 -- validphys2/src/validphys/pseudodata.py | 152 +------------------------ 2 files changed, 1 insertion(+), 163 deletions(-) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index be329c5d9..0772d948a 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -316,18 +316,6 @@ def load(self)->CommonData: #TODO: Use better path handling in python 3.6 return CommonData.ReadFile(str(self.datafile), str(self.sysfile)) - def load_commondata(self, cuts=None): - """ - Loads a coredata.CommonData object from a core.CommonDataSetSpec object - cuts are applied if provided. - """ - # import here to avoid circular imports - from validphys.commondataparser import load_commondata - cd = load_commondata(self) - if cuts is not None: - cd = cd.with_cuts(cuts) - return cd - @property def plot_kinlabels(self): return get_plot_kinlabels(self) diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 34f2a6c82..9fd5863a9 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -10,9 +10,7 @@ import numpy as np import pandas as pd -from validphys.covmats import INTRA_DATASET_SYS_NAME, dataset_t0_predictions - -from validphys.core import PDF +from validphys.covmats import INTRA_DATASET_SYS_NAME from reportengine import collect @@ -237,154 +235,6 @@ def indexed_make_replica(groups_index, make_replica): return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) -def level0_commondata_wc(data, fakepdf): - """ - Given a validphys.core.DataGroupSpec object, load commondata and - generate a new commondata instance with central values replaced - by fakepdf prediction - - Parameters - ---------- - - data : validphys.core.DataGroupSpec - - fakepdf: str - - Returns - ------- - list - list of validphys.coredata.CommonData instances corresponding to - all datasets within one experiment. The central value is replaced - by Level 0 fake data. - - Example - ------- - >>> from validphys.api import API - >>> API.level0_commondata_wc(dataset_inputs=[{"dataset":"NMC"}], use_cuts="internal", theoryid=200, fakepdf="NNPDF40_nnlo_as_01180") - - [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] - """ - - level0_commondata_instances_wc = [] - - # argument fakepdf is passed as str - # generate a core.PDF object with name equal to fakepdf argument - fakepdf = PDF(name=fakepdf) - - for dataset in data.datasets: - commondata_wc = dataset.commondata.load_commondata() - if dataset.cuts is not None: - cuts = dataset.cuts.load() - commondata_wc = commondata_wc.with_cuts(cuts) - # == Generate a new CommonData instance with central value given by Level 0 data generated with fakepdf ==# - t0_prediction = dataset_t0_predictions( - dataset=dataset, t0set=fakepdf - ) # N.B. cuts already applied to th. pred. - level0_commondata_instances_wc.append(commondata_wc.with_central_value(t0_prediction)) - - return level0_commondata_instances_wc - - -def make_level1_data( - # data, - level0_commondata_wc, - filterseed, - data_index): - """ - Given a list of Level 0 commondata instances, return the - same list with central values replaced by Level 1 data. - - Level 1 data is generated using validphys.make_replica. 
- The covariance matrix, from which the stochastic Level 1 - noise is sampled, is built from Level 0 commondata - instances (level0_commondata_wc). This, in particular, - means that the multiplicative systematics are generated - from the Level 0 central values. - - Note that the covariance matrix used to generate Level 2 - pseudodata is consistent with the one used at Level 1 - up to corrections of the order eta * eps, where eta and - eps are defined as shown below: - - Generate L1 data: L1 = L0 + eta, eta ~ N(0,CL0) - Generate L2 data: L2_k = L1 + eps_k, eps_k ~ N(0,CL1) - - where CL0 and CL1 means that the multiplicative entries - have been constructed from Level 0 and Level 1 central - values respectively. - - - Parameters - ---------- - - REMOVE: data : validphys.core.DataGroupSpec - - level0_commondata_wc : list - list of validphys.coredata.CommonData instances corresponding to - all datasets within one experiment. The central value is replaced - by Level 0 fake data. Cuts already applied. - - filterseed : int - random seed used for the generation of Level 1 data - - data_index : pandas.MultiIndex - - Returns - ------- - list - list of validphys.coredata.CommonData instances corresponding to - all datasets within one experiment. The central value is replaced - by Level 1 fake data. - - Example - ------- - - >>> from validphys.api import API - >>> dataset='NMC' - >>> l1_cd = API.make_level1_data(dataset_inputs = [{"dataset":dataset}],use_cuts="internal", theoryid=200, - fakepdf = "NNPDF40_nnlo_as_01180", filterseed=1, data_index) - >>> l1_cd - [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] - """ - - # from validphys.covmats import dataset_inputs_covmat_from_systematics - - # dataset_input_list = list(data.dsinputs) - - # covmat = dataset_inputs_covmat_from_systematics( - # level0_commondata_wc, - # dataset_input_list, - # use_weights_in_covmat=False, - # norm_threshold=None, - # _list_of_central_values=None, - # ) - - # ================== generation of Level1 data ======================# - level1_data = make_replica( - level0_commondata_wc, - filterseed, - # covmat, - genrep=True - ) - - indexed_level1_data = indexed_make_replica(data_index, level1_data) - - dataset_order = {cd.setname: i for i, cd in enumerate(level0_commondata_wc)} - - # ===== create commondata instances with central values given by pseudo_data =====# - level1_commondata_dict = {c.setname: c for c in level0_commondata_wc} - level1_commondata_instances_wc = [] - - for xx, grp in indexed_level1_data.groupby('dataset'): - level1_commondata_instances_wc.append( - level1_commondata_dict[xx].with_central_value(grp.values) - ) - # sort back so as to mantain same order as in level0_commondata_wc - level1_commondata_instances_wc.sort(key=lambda x: dataset_order[x.setname]) - - return level1_commondata_instances_wc - - _group_recreate_pseudodata = collect('indexed_make_replica', ('group_dataset_inputs_by_experiment',)) _recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment')) _recreate_pdf_pseudodata = collect('_group_recreate_pseudodata', ('pdfreplicas', 'fitenvironment')) From ec7f28cca5400a0f2f9c83d82fc57e384300e6c3 Mon Sep 17 00:00:00 2001 From: Francesco Merlotti Date: Tue, 21 May 2024 18:14:09 +0100 Subject: [PATCH 13/14] added k-factor loading in dataset_bsm_factor --- validphys2/src/validphys/dataplots.py | 8 ++-- validphys2/src/validphys/results.py | 66 ++++++++++++++++++++++----- 2 files changed, 59 insertions(+), 15 deletions(-) diff --git 
From ec7f28cca5400a0f2f9c83d82fc57e384300e6c3 Mon Sep 17 00:00:00 2001
From: Francesco Merlotti
Date: Tue, 21 May 2024 18:14:09 +0100
Subject: [PATCH 13/14] added k-factor loading in dataset_bsm_factor

---
 validphys2/src/validphys/dataplots.py |  8 ++--
 validphys2/src/validphys/results.py   | 66 ++++++++++++++++++++++-----
 2 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py
index af2e7b657..b307691f1 100644
--- a/validphys2/src/validphys/dataplots.py
+++ b/validphys2/src/validphys/dataplots.py
@@ -199,7 +199,7 @@ def check_normalize_to(ns, **kwargs):
 #TODO: This interface is horrible. We need to think how to adapt libnnpdf
 #to make this use case easier
 def _plot_fancy_impl(results, commondata, cutlist,
-                     normalize_to:(int,type(None)) = None, labellist=None):
+                     normalize_to:(int,type(None)) = None, labellist=None): # type: ignore
     """Implementation of the data-theory comparison plots.
 
     Providers are supposed to call (yield from) this.
@@ -383,7 +383,7 @@ def _plot_fancy_impl(results, commondata, cutlist,
 @check_normalize_to
 @figuregen
 def plot_fancy(one_or_more_results, commondata, cuts,
-               normalize_to: (int, str, type(None)) = None):
+               normalize_to: (int, str, type(None)) = None): # type: ignore
     """
     Read the PLOTTING configuration for the dataset and generate the
     corrspondig data theory plot.
@@ -439,7 +439,7 @@ def plot_fancy_dataspecs(
     dataspecs_commondata,
     dataspecs_cuts,
     dataspecs_speclabel,
-    normalize_to: (str, int, type(None)) = None,
+    normalize_to: (str, int, type(None)) = None, # type: ignore
 ):
     """
     General interface for data-theory comparison plots.
@@ -1069,7 +1069,7 @@ def plot_xq2(
     display_cuts:bool=True,
     marker_by:str='process type',
     highlight_label:str='highlight',
-    highlight_datasets:(Sequence,type(None))=None,
+    highlight_datasets:(Sequence,type(None))=None, # type: ignore
     aspect:str='landscape',
 ):
     """Plot the (x,Q²) coverage based of the data based on some LO
diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py
index 950a11af1..583b990ff 100644
--- a/validphys2/src/validphys/results.py
+++ b/validphys2/src/validphys/results.py
@@ -12,6 +12,8 @@
 import numpy as np
 import pandas as pd
 import scipy.linalg as la
+import os
+import yaml
 
 from NNPDF import CommonData
 from reportengine.checks import require_one, remove_outer, check_not_empty
@@ -38,11 +40,13 @@
     PredictionsRequireCutsError,
 )
 
+from validphys.loader import Loader
+
 from validphys.n3fit_data_utils import parse_simu_parameters_names_CF
 
 log = logging.getLogger(__name__)
 
 
-
+l = Loader()
 class Result:
     pass
@@ -308,16 +312,16 @@ def group_result_table_68cl(
     return res
 
-def dataset_inputs_bsm_factor(data, pdf, read_bsm_facs):
+def dataset_inputs_bsm_factor(data, pdf, read_bsm_facs, contamination_parameters=None, theoryid=None):
     """Same as :py:func:`validphys.results.dataset_bsm_factor` but for a
     list of dataset inputs.
     """
     res = np.concatenate(
-        [dataset_bsm_factor(dataset, pdf, read_bsm_facs) for dataset in data.datasets]
+        [dataset_bsm_factor(dataset, pdf, read_bsm_facs, contamination_parameters, theoryid) for dataset in data.datasets]
     )
     return res
 
-def dataset_bsm_factor(dataset, pdf, read_bsm_facs):
+def dataset_bsm_factor(dataset, pdf, read_bsm_facs, contamination_parameters=None, theoryid=None):
     """For each replica of ``pdf``, scale the fitted BSM-factors by the
     best fit value.
 
     Returns
@@ -330,13 +334,53 @@ def dataset_bsm_factor(dataset, pdf, read_bsm_facs):
         dataset.simu_parameters_linear_combinations,
         dataset.cuts
     )
+
+    ndata = len(dataset.load().get_cv())
+    nrep = len(pdf)
 
-    if parsed_bsm_facs is None:
+    if parsed_bsm_facs is None and dataset.contamination is None:
         # We want an array of ones that ndata x nrep
         # where ndata is the number of post cut datapoints
-        ndata = len(dataset.load().get_cv())
-        nrep = len(pdf)
         return np.ones((ndata, nrep))
+
+    # after running a contamination fit we do not get fitted bsm factors in a csv
+    # hence, to produce a comparison between the data used for the fitting and the
+    # theory prediction of such fit, we must load k-factors in another way
+    # we write the contamination name, value (e.g. zhat), and linear combination
+    # in the yaml file, and if the `contamination` keyword is present in the dataset_input
+    # we access the `SIMU_` file of the desired dataset, and load the k-factors
+    if dataset.contamination:
+
+        cont_order = dataset.contamination
+        bsm_path = l.datapath / f"theory_{theoryid.id}" / "simu_factors"
+
+        if contamination_parameters is None:
+            log.warning("No Contamination parameters provided")
+            return np.ones((ndata, nrep))
+
+        else:
+            bsm_file = bsm_path / f"SIMU_{dataset.name}.yaml"
+            if not os.path.exists(bsm_file):
+                log.error(f"Could not find a BSM-factor for {dataset.name}. Are you sure they exist in the given theory?")
+                return np.ones((ndata, nrep))
+
+            log.info(f"Loading {dataset.name}")
+
+            cont_name = contamination_parameters["name"]
+            cont_value = contamination_parameters["value"]
+            cont_lin_comb = contamination_parameters["linear_combination"]
+
+            with open(bsm_file, "r+") as stream:
+                simu_card = yaml.safe_load(stream)
+                stream.close()
+
+            k_factors = np.zeros(len(simu_card["SM_fixed"]))
+            for op in cont_lin_comb:
+                k_factors += cont_lin_comb[op] * np.array(simu_card[cont_order][op])
+            k_factors = 1. + k_factors * cont_value / np.array(simu_card[cont_order]["SM"])
+
+            cuts = dataset.cuts.load()
+            return np.array([k_factors[cuts]] * nrep).T
 
     fit_bsm_fac_df = pd.DataFrame(
         {k: v.central_value for k, v in parsed_bsm_facs.items()}
@@ -541,7 +585,7 @@ def dataset_inputs_results(
 # ``results`` to support this.
 # TODO: The above comment doesn't make sense after adding T0. Deprecate this
 def pdf_results(
-    dataset: (DataSetSpec, DataGroupSpec),
+    dataset: (DataSetSpec, DataGroupSpec), # type: ignore
     pdfs: Sequence,
     covariance_matrix,
     sqrt_covmat,
@@ -557,12 +601,12 @@ def pdf_results(
 @require_one("pdfs", "pdf")
 @remove_outer("pdfs", "pdf")
 def one_or_more_results(
-    dataset: (DataSetSpec, DataGroupSpec),
+    dataset: (DataSetSpec, DataGroupSpec), # type: ignore
     covariance_matrix,
     sqrt_covmat,
     dataset_bsm_factor,
-    pdfs: (type(None), Sequence) = None,
-    pdf: (type(None), PDF) = None,
+    pdfs: (type(None), Sequence) = None, # type: ignore
+    pdf: (type(None), PDF) = None, # type: ignore
 ):
     """Generate a list of results, where the first element is the data
     values, and the next is either the prediction for pdf or for each of the pdfs.
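The k-factor construction added to dataset_bsm_factor above can be checked in isolation. The sketch below mirrors the same arithmetic on a hypothetical SIMU card and an abridged set of the operator weights used in the example runcard that follows; the real arrays are read from the SIMU_<dataset>.yaml files under the theory's simu_factors directory.

import numpy as np

# Hypothetical SIMU card content: per-bin SM predictions and linear EFT corrections
simu_card = {
    "EFT_LO": {
        "SM":   np.array([12.0, 9.5, 7.1]),
        "O8qd": np.array([0.30, 0.25, 0.20]),
        "O8qu": np.array([0.28, 0.22, 0.18]),
    }
}
cont_order = "EFT_LO"
cont_value = 0.0008                                 # contamination value, e.g. zhat
cont_lin_comb = {"O8qd": -173.42, "O8qu": -173.42}  # abridged linear combination

# Same construction as in dataset_bsm_factor: weighted sum of the operator
# contributions, normalised to the SM prediction and shifted by one
shift = sum(w * simu_card[cont_order][op] for op, w in cont_lin_comb.items())
k_factors = 1.0 + shift * cont_value / simu_card[cont_order]["SM"]

# Broadcast the per-bin factors to every PDF replica, as the provider returns them
nrep = 3
bsm_factor = np.array([k_factors] * nrep).T         # shape (ndata, nrep)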
From f580a16dfb6b05ffe088be1d61159a698acca049 Mon Sep 17 00:00:00 2001
From: Francesco Merlotti
Date: Wed, 29 May 2024 11:03:05 +0100
Subject: [PATCH 14/14] added plot data-theory comparison contaminated

---
 ...t_data_theory_comparison_contaminated.yaml | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 validphys2/examples/plot_data_theory_comparison_contaminated.yaml

diff --git a/validphys2/examples/plot_data_theory_comparison_contaminated.yaml b/validphys2/examples/plot_data_theory_comparison_contaminated.yaml
new file mode 100644
index 000000000..d1c0e6384
--- /dev/null
+++ b/validphys2/examples/plot_data_theory_comparison_contaminated.yaml
@@ -0,0 +1,48 @@
+meta:
+  title: TTBAR_13TEV_TTMNORM data/theory comparison baseline
+  keywords: [contamination, ttbar, zhat]
+  author: Francesco Merlotti
+
+pdfs:
+  - {id: 240516_fm_cont_ttbar_zhat_nojets_08, label: Contaminated fit (Z=8e-4)}
+
+pdf: 240516_fm_cont_ttbar_zhat_nojets_08
+
+fit: 240516_fm_base_zhat_nojets_lv0
+
+use_fitcommondata: True
+
+theoryid: 270
+
+use_cuts: internal
+
+dataset_inputs:
+  - {dataset: ATLAS_TTBAR_13TEV_TTMNORM, cfac: [QCD], contamination: EFT_LO}
+  - {dataset: CMS_TTBAR_13TEV_TTMNORM, cfac: [QCD], contamination: EFT_LO}
+
+contamination_parameters:
+  name: Z
+  value: 0.0008
+  linear_combination:
+    OtG: 0.
+    O8qd: -173.42
+    O1qd: 0.
+    O1qu: 0.
+    O8qu: -173.42
+    O1dt: 0.
+    O8dt: -173.42
+    O1qt: 0.
+    O8qt: -173.42
+    O1ut: 0.
+    O8ut: -173.42
+    O11qq: 0.
+    O13qq: 0.
+    O81qq: -173.42
+    O83qq: 0.
+
+template_text: |
+  {@ dataset_inputs plot_fancy@}
+  {@ dataset_inputs::pdfs plot_fancy(normalize_to=data)@}
+
+actions_:
+  - report(main=true)
\ No newline at end of file
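The example runcard above would normally be rendered with the validphys command line (validphys plot_data_theory_comparison_contaminated.yaml) once the quoted fit and PDF are installed locally. The contamination-aware provider from PATCH 13 can also be queried through the API; the snippet below is a sketch under the assumption that all of its inputs, including the read_bsm_facs of the baseline fit, can be resolved from the runcard keys shown above.

from validphys.api import API

bsm_facs = API.dataset_inputs_bsm_factor(
    dataset_inputs=[
        {"dataset": "ATLAS_TTBAR_13TEV_TTMNORM", "cfac": ["QCD"], "contamination": "EFT_LO"},
        {"dataset": "CMS_TTBAR_13TEV_TTMNORM", "cfac": ["QCD"], "contamination": "EFT_LO"},
    ],
    theoryid=270,
    use_cuts="internal",
    pdf="240516_fm_cont_ttbar_zhat_nojets_08",
    fit="240516_fm_base_zhat_nojets_lv0",
    use_fitcommondata=True,
    contamination_parameters={
        "name": "Z",
        "value": 0.0008,
        # abridged; the full operator list is given in the runcard above
        "linear_combination": {"O8qd": -173.42, "O8qu": -173.42, "O81qq": -173.42},
    },
)
print(bsm_facs.shape)  # one factor per (cut) data point and PDF replica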