diff --git a/validphys2/examples/plot_data_theory_comparison_contaminated.yaml b/validphys2/examples/plot_data_theory_comparison_contaminated.yaml
new file mode 100644
index 000000000..d1c0e6384
--- /dev/null
+++ b/validphys2/examples/plot_data_theory_comparison_contaminated.yaml
@@ -0,0 +1,48 @@
+meta:
+  title: TTBAR_13TEV_TTMNORM data/theory comparison baseline
+  keywords: [contamination, ttbar, zhat]
+  author: Francesco Merlotti
+
+pdfs:
+  - {id: 240516_fm_cont_ttbar_zhat_nojets_08, label: Contaminated fit (Z=8e-4)}
+
+pdf: 240516_fm_cont_ttbar_zhat_nojets_08
+
+fit: 240516_fm_base_zhat_nojets_lv0
+
+use_fitcommondata: True
+
+theoryid: 270
+
+use_cuts: internal
+
+dataset_inputs:
+  - {dataset: ATLAS_TTBAR_13TEV_TTMNORM, cfac: [QCD], contamination: EFT_LO}
+  - {dataset: CMS_TTBAR_13TEV_TTMNORM, cfac: [QCD], contamination: EFT_LO}
+
+contamination_parameters:
+  name: Z
+  value: 0.0008
+  linear_combination:
+    OtG: 0.
+    O8qd: -173.42
+    O1qd: 0.
+    O1qu: 0.
+    O8qu: -173.42
+    O1dt: 0.
+    O8dt: -173.42
+    O1qt: 0.
+    O8qt: -173.42
+    O1ut: 0.
+    O8ut: -173.42
+    O11qq: 0.
+    O13qq: 0.
+    O81qq: -173.42
+    O83qq: 0.
+
+template_text: |
+  {@ dataset_inputs plot_fancy@}
+  {@ dataset_inputs::pdfs plot_fancy(normalize_to=data)@}
+
+actions_:
+  - report(main=true)
\ No newline at end of file
diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py
index af2e7b657..b307691f1 100644
--- a/validphys2/src/validphys/dataplots.py
+++ b/validphys2/src/validphys/dataplots.py
@@ -199,7 +199,7 @@ def check_normalize_to(ns, **kwargs):
 #TODO: This interface is horrible. We need to think how to adapt libnnpdf
 #to make this use case easier
 def _plot_fancy_impl(results, commondata, cutlist,
-                     normalize_to:(int,type(None)) = None, labellist=None):
+                     normalize_to:(int,type(None)) = None, labellist=None):  # type: ignore
     """Implementation of the data-theory comparison plots. Providers are
     supposed to call (yield from) this.
@@ -383,7 +383,7 @@ def _plot_fancy_impl(results, commondata, cutlist,
 @check_normalize_to
 @figuregen
 def plot_fancy(one_or_more_results, commondata, cuts,
-               normalize_to: (int, str, type(None)) = None):
+               normalize_to: (int, str, type(None)) = None):  # type: ignore
     """
     Read the PLOTTING configuration for the dataset and generate the
     corresponding data theory plot.
@@ -439,7 +439,7 @@ def plot_fancy_dataspecs(
     dataspecs_commondata,
     dataspecs_cuts,
     dataspecs_speclabel,
-    normalize_to: (str, int, type(None)) = None,
+    normalize_to: (str, int, type(None)) = None,  # type: ignore
 ):
     """
     General interface for data-theory comparison plots.
@@ -1069,7 +1069,7 @@ def plot_xq2(
     display_cuts:bool=True,
     marker_by:str='process type',
     highlight_label:str='highlight',
-    highlight_datasets:(Sequence,type(None))=None,
+    highlight_datasets:(Sequence,type(None))=None,  # type: ignore
     aspect:str='landscape',
 ):
     """Plot the (x,Q²) coverage of the data based on some LO
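A note on the `# type: ignore` markers added above: validphys providers use reportengine-style tuple annotations such as `(int, str, type(None))` as runtime argument checks, and these are not valid `typing` annotations, so static checkers reject them. The markers silence those errors without changing runtime behaviour. For comparison only, a minimal sketch of the typing equivalent (the function name is hypothetical and not part of the patch):

```python
from typing import Optional, Union

# Hypothetical illustration only: the typing-style equivalent of the
# reportengine annotation
#     normalize_to: (int, str, type(None)) = None
# which a static checker would accept without a `# type: ignore`.
def plot_fancy_typed(normalize_to: Optional[Union[int, str]] = None) -> None:
    ...
```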
diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py
index 950a11af1..583b990ff 100644
--- a/validphys2/src/validphys/results.py
+++ b/validphys2/src/validphys/results.py
@@ -12,6 +12,8 @@ import numpy as np
 import pandas as pd
 import scipy.linalg as la
+import os
+import yaml
 
 from NNPDF import CommonData
 from reportengine.checks import require_one, remove_outer, check_not_empty
@@ -38,11 +40,13 @@
     PredictionsRequireCutsError,
 )
 
+from validphys.loader import Loader
+
 from validphys.n3fit_data_utils import parse_simu_parameters_names_CF
 
 log = logging.getLogger(__name__)
 
-
+l = Loader()
 
 class Result: pass
@@ -308,16 +312,16 @@ def group_result_table_68cl(
     return res
 
 
-def dataset_inputs_bsm_factor(data, pdf, read_bsm_facs):
+def dataset_inputs_bsm_factor(data, pdf, read_bsm_facs, contamination_parameters=None, theoryid=None):
     """Same as :py:func:`validphys.results.dataset_bsm_factor` but for
     a list of dataset inputs.
     """
     res = np.concatenate(
-        [dataset_bsm_factor(dataset, pdf, read_bsm_facs) for dataset in data.datasets]
+        [dataset_bsm_factor(dataset, pdf, read_bsm_facs, contamination_parameters, theoryid) for dataset in data.datasets]
     )
     return res
 
 
-def dataset_bsm_factor(dataset, pdf, read_bsm_facs):
+def dataset_bsm_factor(dataset, pdf, read_bsm_facs, contamination_parameters=None, theoryid=None):
     """For each replica of ``pdf``, scale the fitted BSM-factors
     by the best fit value.
 
     Returns
@@ -330,13 +334,53 @@
         dataset.simu_parameters_linear_combinations,
         dataset.cuts
     )
+
+    ndata = len(dataset.load().get_cv())
+    nrep = len(pdf)
 
-    if parsed_bsm_facs is None:
+    if parsed_bsm_facs is None and dataset.contamination is None:
         # We want an array of ones that is ndata x nrep,
         # where ndata is the number of post-cut datapoints
-        ndata = len(dataset.load().get_cv())
-        nrep = len(pdf)
         return np.ones((ndata, nrep))
+
+    # After running a contaminated fit we do not get fitted BSM factors in a
+    # CSV. Hence, to compare the data used in the fit against the theory
+    # predictions of that fit, the k-factors must be loaded differently: the
+    # contamination name, value (e.g. zhat) and linear combination are written
+    # in the runcard, and when the `contamination` key is present in a
+    # dataset_input we read the k-factors from the dataset's `SIMU_` file in
+    # the given theory.
+    if dataset.contamination:
+
+        cont_order = dataset.contamination
+        bsm_path = l.datapath / f"theory_{theoryid.id}" / "simu_factors"
+
+        if contamination_parameters is None:
+            log.warning("No contamination parameters provided")
+            return np.ones((ndata, nrep))
+
+        bsm_file = bsm_path / f"SIMU_{dataset.name}.yaml"
+        if not os.path.exists(bsm_file):
+            log.error(
+                f"Could not find BSM-factors for {dataset.name}. "
+                "Are you sure they exist in the given theory?"
+            )
+            return np.ones((ndata, nrep))
+
+        log.info(f"Loading {dataset.name}")
+
+        cont_name = contamination_parameters["name"]
+        cont_value = contamination_parameters["value"]
+        cont_lin_comb = contamination_parameters["linear_combination"]
+
+        with open(bsm_file, "r") as stream:
+            simu_card = yaml.safe_load(stream)
+
+        # Build one multiplicative k-factor per datapoint:
+        # k = 1 + value * sum_op(coeff_op * EFT[op]) / SM
+        k_factors = np.zeros(len(simu_card["SM_fixed"]))
+        for op in cont_lin_comb:
+            k_factors += cont_lin_comb[op] * np.array(simu_card[cont_order][op])
+        k_factors = 1. + k_factors * cont_value / np.array(simu_card[cont_order]["SM"])
+
+        # Apply the cuts and broadcast the same factors to every replica.
+        cuts = dataset.cuts.load()
+        return np.array([k_factors[cuts]] * nrep).T
 
     fit_bsm_fac_df = pd.DataFrame(
         {k: v.central_value for k, v in parsed_bsm_facs.items()}
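For reference, a minimal standalone sketch of the k-factor construction performed by the new `dataset.contamination` branch, assuming a SIMU_ card with an `SM_fixed` block and an `EFT_LO` block containing `SM` and per-operator entries (all numbers below are made up for illustration):

```python
import numpy as np

# Toy SIMU_ card: four datapoints, two non-zero operators.
simu_card = {
    "SM_fixed": [1.0, 1.0, 1.0, 1.0],
    "EFT_LO": {
        "SM": np.array([10.0, 12.0, 9.0, 11.0]),
        "O8qd": np.array([0.5, 0.4, 0.6, 0.3]),
        "O81qq": np.array([0.2, 0.1, 0.3, 0.2]),
    },
}
lin_comb = {"O8qd": -173.42, "O81qq": -173.42}  # linear_combination from the runcard
z = 0.0008                                      # contamination value (Z)
nrep = 2                                        # number of PDF replicas

# k = 1 + z * sum_op(coeff_op * EFT[op]) / SM, one factor per datapoint
k = np.zeros(len(simu_card["SM_fixed"]))
for op, coeff in lin_comb.items():
    k += coeff * simu_card["EFT_LO"][op]
k = 1.0 + k * z / simu_card["EFT_LO"]["SM"]

cuts = np.array([0, 1, 3])               # indices of post-cut datapoints
factors = np.array([k[cuts]] * nrep).T   # shape (ndata_after_cuts, nrep)
print(factors.shape)                     # (3, 2)
```

With the example runcard's Z = 8e-4 and coefficients of -173.42, this rescales each datapoint by roughly 1 - 0.139 * (sum of the operator k-factors) / SM.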
@@ -541,7 +585,7 @@ def dataset_inputs_results(
 # ``results`` to support this.
 # TODO: The above comment doesn't make sense after adding T0. Deprecate this
 def pdf_results(
-    dataset: (DataSetSpec, DataGroupSpec),
+    dataset: (DataSetSpec, DataGroupSpec),  # type: ignore
     pdfs: Sequence,
     covariance_matrix,
     sqrt_covmat,
@@ -557,12 +601,12 @@
 @require_one("pdfs", "pdf")
 @remove_outer("pdfs", "pdf")
 def one_or_more_results(
-    dataset: (DataSetSpec, DataGroupSpec),
+    dataset: (DataSetSpec, DataGroupSpec),  # type: ignore
     covariance_matrix,
     sqrt_covmat,
     dataset_bsm_factor,
-    pdfs: (type(None), Sequence) = None,
-    pdf: (type(None), PDF) = None,
+    pdfs: (type(None), Sequence) = None,  # type: ignore
+    pdf: (type(None), PDF) = None,  # type: ignore
 ):
     """Generate a list of results, where the first element is the data values,
     and the next is either the prediction for pdf or for each of the pdfs.
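Lastly, a small sketch of the shape contract that `dataset_inputs_bsm_factor` relies on: each per-dataset factor array is (ndata_after_cuts, nrep), and `np.concatenate` stacks them into one block covering every datapoint in `dataset_inputs` (all sizes here are illustrative):

```python
import numpy as np

# Per-dataset factors of shape (ndata_i, nrep) stack along axis 0 into a
# single (sum_i ndata_i, nrep) array, one row per datapoint across datasets.
nrep = 3
atlas = np.ones((5, nrep))        # dataset with no BSM rescaling
cms = 1.02 * np.ones((4, nrep))   # dataset with a flat 2% k-factor
combined = np.concatenate([atlas, cms])
assert combined.shape == (9, nrep)
```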