
Merge pull request #69 from HEP-PBSP/level0_commondata_wc
`load_commondata`, `level0_commondata_wc`, and `make_level1_data`
FrancescoMerlotti authored Jun 18, 2024
2 parents 9b16111 + 57099f8 commit c3ce520
Showing 6 changed files with 488 additions and 11 deletions.
5 changes: 5 additions & 0 deletions validphys2/src/validphys/config.py
@@ -38,6 +38,7 @@
    MatchedCuts,
    SimilarCuts,
    ThCovMatSpec,
    PDF,
)
from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas
from validphys.loader import (
@@ -171,6 +172,10 @@ def parse_pdf(self, name: str):
        except NotImplementedError as e:
            raise ConfigError(str(e))
        return pdf

    def parse_fakepdf(self, name: str) -> PDF:
        """PDF set used to generate the fake data in a closure test."""
        return self.parse_pdf(name)

    def parse_load_weights_from_fit(self, name: str):
        """A fit in the results folder, containing at least a valid filter result."""
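For reference, a minimal sketch of how the new fakepdf key can be resolved; this assumes the standard validphys API entry point resolves config keys by name, and the PDF set is a placeholder that must be installed locally:

from validphys.api import API

# fakepdf is parsed exactly like pdf, so it resolves to a validphys.core.PDF
fakepdf = API.fakepdf(fakepdf="NNPDF40_nnlo_as_01180")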
12 changes: 12 additions & 0 deletions validphys2/src/validphys/core.py
@@ -316,6 +316,18 @@ def load(self) -> CommonData:
        # TODO: Use better path handling in python 3.6
        return CommonData.ReadFile(str(self.datafile), str(self.sysfile))

    def load_commondata(self, cuts=None):
        """
        Load a coredata.CommonData object from this core.CommonDataSetSpec;
        cuts are applied if provided.
        """
        # import here to avoid circular imports
        from validphys.commondataparser import load_commondata

        cd = load_commondata(self)
        if cuts is not None:
            cd = cd.with_cuts(cuts)
        return cd

    @property
    def plot_kinlabels(self):
        return get_plot_kinlabels(self)
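A short usage sketch for the new load_commondata method, assuming API.dataset resolves a DataSetSpec as in standard validphys; the dataset name and theory ID are placeholders:

from validphys.api import API

ds = API.dataset(dataset_input={"dataset": "NMC"},
                 theoryid=200,
                 use_cuts="internal")
# Load the parsed commondata with the dataset's cuts applied
cd = ds.commondata.load_commondata(cuts=ds.cuts.load())
print(cd.ndata)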
3 changes: 3 additions & 0 deletions validphys2/src/validphys/coredata.py
@@ -263,6 +263,9 @@ def additive_errors(self):
        add_table.columns = add_systype["name"].to_numpy()
        return add_table.loc[:, add_table.columns != "SKIP"]

    @property
    def commondata_table_indices(self):
        """Zero-based indices of the commondata table (the table is indexed from 1)."""
        return self.commondata_table.index - 1

    def systematic_errors(self, central_values=None):
        """Returns all systematic errors as absolute uncertainties, with a
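The new commondata_table_indices property simply shifts the 1-indexed commondata table to zero-based positions; a tiny self-contained illustration with made-up values:

import pandas as pd

# The commondata table is indexed from 1; subtracting 1 gives zero-based
# positions suitable for indexing arrays of predictions.
table = pd.DataFrame({"data": [0.1, 0.2, 0.3]}, index=[1, 2, 3])
print(table.index - 1)  # Index([0, 1, 2], dtype='int64')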
242 changes: 241 additions & 1 deletion validphys2/src/validphys/pseudodata.py
@@ -9,15 +9,22 @@

import numpy as np
import pandas as pd
import os
import yaml

from validphys.covmats import INTRA_DATASET_SYS_NAME, dataset_t0_predictions

from validphys.convolution import central_predictions
from validphys.loader import Loader

from reportengine import collect

FILE_PREFIX = "datacuts_theory_fitting_"

log = logging.getLogger(__name__)

l = Loader()

DataTrValSpec = namedtuple('DataTrValSpec', ['pseudodata', 'tr_idx', 'val_idx'])

context_index = collect("groups_index", ("fitcontext",))
@@ -235,6 +242,239 @@ def indexed_make_replica(groups_index, make_replica):
    return pd.DataFrame(make_replica, index=groups_index, columns=["data"])


def level0_commondata_wc(data, fakepdf):
    """
    Given a validphys.core.DataGroupSpec object, load the commondata and
    generate new commondata instances with the central values replaced by
    the fakepdf predictions.

    Parameters
    ----------
    data : validphys.core.DataGroupSpec
    fakepdf : validphys.core.PDF

    Returns
    -------
    list
        list of validphys.coredata.CommonData instances corresponding to
        all datasets within one experiment. The central value is replaced
        by Level 0 fake data.

    Example
    -------
    >>> from validphys.api import API
    >>> API.level0_commondata_wc(dataset_inputs=[{"dataset": "NMC"}],
                                 use_cuts="internal",
                                 theoryid=200,
                                 fakepdf="NNPDF40_nnlo_as_01180")
    [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)]
    """
    level0_commondata_instances_wc = []

    for dataset in data.datasets:
        commondata_wc = dataset.commondata.load_commondata()
        if dataset.cuts is not None:
            cuts = dataset.cuts.load()
            commondata_wc = commondata_wc.with_cuts(cuts=cuts)

        # Generate a new CommonData instance with central values given by
        # Level 0 data generated with fakepdf (cuts are already applied to
        # the theory predictions).
        t0_prediction = dataset_t0_predictions(dataset=dataset, t0set=fakepdf)
        level0_commondata_instances_wc.append(
            commondata_wc.with_central_value(t0_prediction)
        )

    return level0_commondata_instances_wc
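As a quick sanity check, the central values of the returned instances should coincide with the t0 predictions of the fakepdf; a sketch reusing the placeholder inputs from the docstring example above:

from validphys.api import API

lv0 = API.level0_commondata_wc(dataset_inputs=[{"dataset": "NMC"}],
                               use_cuts="internal",
                               theoryid=200,
                               fakepdf="NNPDF40_nnlo_as_01180")
# Each CommonData now carries the fakepdf prediction as its central value
print(lv0[0].central_values[:5])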


def make_level1_data(level0_commondata_wc, filterseed, data_index):
    """
    Given a list of Level 0 commondata instances, return the same list
    with central values replaced by Level 1 data.

    Level 1 data is generated using validphys.make_replica. The covariance
    matrix, from which the stochastic Level 1 noise is sampled, is built
    from the Level 0 commondata instances (level0_commondata_wc). This, in
    particular, means that the multiplicative systematics are generated
    from the Level 0 central values.

    Note that the covariance matrix used to generate Level 2 pseudodata is
    consistent with the one used at Level 1 up to corrections of the order
    eta * eps, where eta and eps are defined as shown below:

        Generate L1 data: L1   = L0 + eta,   eta   ~ N(0, C_L0)
        Generate L2 data: L2_k = L1 + eps_k, eps_k ~ N(0, C_L1)

    where C_L0 and C_L1 mean that the multiplicative entries have been
    constructed from the Level 0 and Level 1 central values respectively.

    Parameters
    ----------
    level0_commondata_wc : list
        list of validphys.coredata.CommonData instances corresponding to
        all datasets within one experiment. The central value is replaced
        by Level 0 fake data. Cuts are already applied.
    filterseed : int
        random seed used for the generation of Level 1 data
    data_index : pandas.MultiIndex

    Returns
    -------
    list
        list of validphys.coredata.CommonData instances corresponding to
        all datasets within one experiment. The central value is replaced
        by Level 1 fake data.

    Example
    -------
    >>> from validphys.api import API
    >>> API.make_level1_data(dataset_inputs=[{"dataset": "NMC"}],
                             use_cuts="internal",
                             theoryid=200,
                             fakepdf="NNPDF40_nnlo_as_01180",
                             filterseed=0,
                             data_index=data_index)
    [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)]
    """
    # Generate the Level 1 data
    level1_data = make_replica(level0_commondata_wc, filterseed, genrep=True)

    indexed_level1_data = indexed_make_replica(data_index, level1_data)

    dataset_order = {cd.setname: i for i, cd in enumerate(level0_commondata_wc)}

    # Create commondata instances with central values given by the pseudodata
    level1_commondata_dict = {c.setname: c for c in level0_commondata_wc}
    level1_commondata_instances_wc = []

    for name, grp in indexed_level1_data.groupby('dataset'):
        level1_commondata_instances_wc.append(
            level1_commondata_dict[name].with_central_value(grp.values)
        )
    # Sort back so as to maintain the same order as in level0_commondata_wc
    level1_commondata_instances_wc.sort(key=lambda x: dataset_order[x.setname])

    return level1_commondata_instances_wc
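The C_L0/C_L1 note in the docstring amounts to the following two-step sampling; a self-contained numpy sketch with a toy diagonal covariance (all numbers made up):

import numpy as np

rng = np.random.default_rng(0)  # plays the role of filterseed

L0 = np.array([1.0, 2.0, 3.0])   # Level 0 central values (fakepdf predictions)
C_L0 = np.diag((0.1 * L0) ** 2)  # 10% multiplicative errors built from L0

# Level 1: a single stochastic shift of the Level 0 data
eta = rng.multivariate_normal(np.zeros_like(L0), C_L0)
L1 = L0 + eta

# Level 2: replicas fluctuate around L1, with the multiplicative part
# of the covariance rebuilt from the Level 1 central values
C_L1 = np.diag((0.1 * L1) ** 2)
L2 = [L1 + rng.multivariate_normal(np.zeros_like(L1), C_L1) for _ in range(10)]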


def make_level1_list_data(level0_commondata_wc, filterseed, n_samples, data_index):
    """
    Given a list of validphys.coredata.CommonData instances with central
    values replaced by fakepdf predictions (cuts applied), generate a list
    of n_samples Level 1 datasets from those instances.

    Parameters
    ----------
    level0_commondata_wc : list
        validphys.coredata.CommonData instances where the central value is
        replaced by Level 0 fakepdf predictions
    filterseed : int
        starting seed used to make different replicas
    n_samples : int
        number of replicas
    data_index : pandas.MultiIndex
        index providing information on the experiment, the dataset, and
        the cut index

    Returns
    -------
    list
        list of lists of validphys.coredata.CommonData instances
        corresponding to all datasets within one experiment. The central
        value is replaced by Level 1 fake data.

    Example
    -------
    >>> from validphys.api import API
    >>> from validphys.loader import Loader
    >>> from validphys.results import data_index
    >>> l = Loader()
    >>> dataset = l.check_dataset(name="NMC", theoryid=200)
    >>> experiment = l.check_experiment(name="data", datasets=[dataset])
    >>> lv0_cd_wc = API.level0_commondata_wc(dataset_inputs=[{"dataset": "NMC"}],
                                             use_cuts="internal",
                                             theoryid=200,
                                             fakepdf="NNPDF40_nnlo_as_01180")
    >>> API.make_level1_list_data(level0_commondata_wc=lv0_cd_wc,
                                  filterseed=0,
                                  n_samples=1,
                                  data_index=data_index(experiment))
    [[CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)]]
    """
    samples = [
        make_level1_data(
            level0_commondata_wc=level0_commondata_wc,
            filterseed=filterseed + i,
            data_index=data_index,
        )
        for i in range(n_samples)
    ]

    return samples


def sm_predictions(dataset_inputs, pdf, theoryid):
    """
    Compute Standard Model (central) predictions for the given datasets.

    Parameters
    ----------
    dataset_inputs : NSList of core.DataSetInput objects
    pdf : core.PDF object
    theoryid : TheoryIDSpec

    Returns
    -------
    dict
        dictionary of Standard Model predictions for the given
        dataset_inputs, pdf, and theory
    """
    sm_dict = {}

    for dataset in dataset_inputs:
        data = l.check_dataset(dataset.name, cfac=dataset.cfac, theoryid=theoryid)
        sm_dict[dataset.name] = central_predictions(data, pdf)

    return sm_dict
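A usage sketch for sm_predictions through the API, assuming the provider resolves by name as usual; dataset, PDF, and theory are placeholders:

from validphys.api import API

preds = API.sm_predictions(dataset_inputs=[{"dataset": "NMC"}],
                           pdf="NNPDF40_nnlo_as_01180",
                           theoryid=200)
# Maps each dataset name to a DataFrame of central predictions
print(preds["NMC"].head())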


_group_recreate_pseudodata = collect('indexed_make_replica', ('group_dataset_inputs_by_experiment',))
_recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment'))
_recreate_pdf_pseudodata = collect('_group_recreate_pseudodata', ('pdfreplicas', 'fitenvironment'))
