Skip to content

Commit

Permalink
Contamination now working :)
Browse files Browse the repository at this point in the history
  • Loading branch information
James Moore committed Oct 10, 2023
1 parent 849888f commit 712712b
Show file tree
Hide file tree
Showing 9 changed files with 74 additions and 18 deletions.
4 changes: 3 additions & 1 deletion libnnpdf/src/NNPDF/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ namespace NNPDF

bool UseFixedPredictions; //!< Flag to indicate using fixed predictions from simu_factor file
int SpecialTheoryID; //!< The special hacky theory ID
std::vector<float> ContaminationFactor; //!< The factor for contamination in closure tests

// private methods for constructor
void GenCovMat() const; //!< Generate covariance matrix
Expand All @@ -48,7 +49,7 @@ namespace NNPDF
DataSet(); //!< Disable default constructor

public:
DataSet(CommonData const&, FKSet const&, double weight=1., bool use_fixed_predictions=false, int special_theory_id=0); //!< Constructor
DataSet(CommonData const&, FKSet const&, std::vector<float> contamination_factor, double weight=1., bool use_fixed_predictions=false, int special_theory_id=0); //!< Constructor
DataSet(const DataSet&, std::vector<int> const&); //!< Masked Copy constructor
DataSet(const DataSet&) = default; //!< Masked Copy constructor
virtual ~DataSet(); //!< The destructor.
Expand All @@ -63,6 +64,7 @@ namespace NNPDF

bool GetUseFixedPredictions() const {return UseFixedPredictions; } //!< Getter for UseFixedPredictions
int GetSpecialTheoryID() const {return SpecialTheoryID; } //!< The special hacky theory ID
std::vector<float> GetContaminationFactor() const {return ContaminationFactor; } //!< The contamination factor

double const& GetT0Pred(int i) const { return fT0Pred[i];} //!< Return t0 prediction
double GetWeight() const {return fWeight;}
Expand Down
8 changes: 5 additions & 3 deletions libnnpdf/src/dataset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,15 @@ using namespace NNPDF;
* \param weight the factor by which the importance of the dataset in the fit chi²
* is increased.
*/
DataSet::DataSet(CommonData const& data, FKSet const& set, double weight, bool use_fixed_predictions, int special_theory_id):
DataSet::DataSet(CommonData const& data, FKSet const& set, std::vector<float> contamination_factor, double weight, bool use_fixed_predictions, int special_theory_id):
CommonData(data),
FKSet(set),
fIsArtificial(false),
fIsT0(false),
fWeight(weight),
UseFixedPredictions(use_fixed_predictions),
SpecialTheoryID(special_theory_id)
SpecialTheoryID(special_theory_id),
ContaminationFactor(contamination_factor)
{
fT0Pred.reserve(fNData);

Expand All @@ -58,7 +59,8 @@ DataSet::DataSet(const DataSet& set, std::vector<int> const& mask):
fIsT0(set.fIsT0),
fWeight(set.fWeight),
UseFixedPredictions(set.UseFixedPredictions),
SpecialTheoryID(set.SpecialTheoryID)
SpecialTheoryID(set.SpecialTheoryID),
ContaminationFactor(set.ContaminationFactor)
{
fT0Pred.reserve(fNData);

Expand Down
8 changes: 5 additions & 3 deletions libnnpdf/src/experiments.cc
Original file line number Diff line number Diff line change
Expand Up @@ -407,16 +407,18 @@ void Experiment::MakeClosure(const vector<ThPredictions>& predictions, bool cons
for (int i = 0; i < set.GetNData(); i++)
{
newdata[i] = theory.GetObsCV(i);
// This is our opportunity to CONTAMINATE things, and sort out the fixed observables

// Sort out fixed observables first, if necessary
if (set.GetUseFixedPredictions())
{
string fixed_prediction_path = get_data_path() + "theory_" + std::to_string(set.GetSpecialTheoryID()) + "/simu_factors/SIMU_" + set.GetSetName() + ".yaml";
cout << fixed_prediction_path << endl;
YAML::Node read_file = YAML::LoadFile(fixed_prediction_path);

std::vector<float> fixed_predictions = read_file["SM_fixed"].as<std::vector<float>>();
newdata[i] = fixed_predictions[i];
}

// Finally, add the contamination
newdata[i] = newdata[i]*set.GetContaminationFactor()[i];
}

set.UpdateData(newdata.data()); // MakeClosure treated as shifts rather than normalisations
Expand Down
1 change: 1 addition & 0 deletions libnnpdf/wrapper/src/nnpdf.i
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
%template(vector_str) std::vector<std::string>;
%template(vector_double) std::vector<double>;
%template(vector_int) std::vector<int>;
%template(vector_float) std::vector<float>;

%typemap(out) NNPDF::matrix<double> {
auto size = $1.size(0)*$1.size(1);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ description: "Runcard template for the new (more flexible) contaminated fits. Th
############################################################
dataset_inputs:
- {dataset: NMC, frac: 0.75}
- {dataset: ATLASTTBARTOT7TEV, cfac: [QCD], contamination: 'EFT-LO'}
- {dataset: ATLAS_TOPDIFF_DILEPT_8TEV_TTMNORM, cfac: [QCD], contamination: 'EFT-LO'}
- {dataset: ATLAS_TTBAR_8TEV_ASY, cfac: [QCD], contamination: 'EFT-LO'}
- {dataset: ATLAS_SINGLETOPW_8TEV_TOTAL, use_fixed_predictions: True, contamination: 'EFT-LO'}
- {dataset: ATLASTTBARTOT7TEV, cfac: [QCD], contamination: 'EFT_LO'}
- {dataset: ATLAS_TOPDIFF_DILEPT_8TEV_TTMNORM, cfac: [QCD], contamination: 'EFT_LO'}
- {dataset: ATLAS_TTBAR_8TEV_ASY, cfac: [QCD], contamination: 'EFT_LO'}
- {dataset: ATLAS_SINGLETOPW_8TEV_TOTAL, use_fixed_predictions: True, contamination: 'EFT_LO'}

fixed_pdf_fit: False
# load_weights_from_fit: 221103-jmm-no_top_1000_iterated # If this is uncommented, training starts here.
Expand All @@ -27,8 +27,8 @@ closuretest:
rancuttrnval: false # 0(1) to output training(valiation) chi2 in report
printpdf4gen: false # To print info on PDFs during minimization
contamination_parameters:
- {name: 'OtG', value: 0.01}
- {name: 'Opt', value: 0.02}
- {name: 'OtG', value: 10}
- {name: 'Opt', value: 1}

seed: 0
rngalgo: 0
Expand Down
6 changes: 5 additions & 1 deletion nnpdfcpp/src/common/src/loadutils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,11 @@ DataSet LoadDataSet(NNPDFSettings const &settings, std::string const &setname,
if (mask.size() > 0) {
fk = FKSet(fk, mask);
}
return DataSet(cd, fk, weight);

// Since this is outside the contamination pipeline, set the contamination factor to be 1.
std::vector<float> contamination_factor(cd.GetNData(), 1.0);

return DataSet(cd, fk, contamination_factor, weight);
}

/**
Expand Down
15 changes: 14 additions & 1 deletion validphys2/src/validphys/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -493,7 +493,7 @@ def produce_simu_parameters_scales(self, simu_parameters=None):
def parse_dataset_input(self, dataset: Mapping, simu_parameters_names, simu_parameters_scales, n_simu_parameters, simu_parameters=None):
"""The mapping that corresponds to the dataset specifications in the
fit files"""
known_keys = {"dataset", "sys", "cfac", "frac", "weight", "custom_group", "simu_fac", "use_fixed_predictions"}
known_keys = {"dataset", "sys", "cfac", "frac", "weight", "custom_group", "simu_fac", "use_fixed_predictions", "contamination"}
try:
name = dataset["dataset"]
if not isinstance(name, str):
Expand All @@ -507,6 +507,7 @@ def parse_dataset_input(self, dataset: Mapping, simu_parameters_names, simu_para
cfac = dataset.get("cfac", tuple())
frac = dataset.get("frac", 1)
use_fixed_predictions = dataset.get("use_fixed_predictions", False)
contamination = dataset.get("contamination", None)
if not isinstance(frac, numbers.Real):
raise ConfigError(f"'frac' must be a number, not '{frac}'")
if frac < 0 or frac > 1:
Expand Down Expand Up @@ -542,6 +543,7 @@ def parse_dataset_input(self, dataset: Mapping, simu_parameters_names, simu_para
weight=weight,
custom_group=custom_group,
use_fixed_predictions=use_fixed_predictions,
contamination=contamination,
**bsm_data
)

Expand Down Expand Up @@ -698,6 +700,12 @@ def produce_cuts(self, *, commondata, use_cuts):
return self._produce_similarity_cuts(commondata)
raise TypeError("Wrong use_cuts")

def produce_contamination_data(self, closuretest):
if "contamination_parameters" in closuretest.keys():
return closuretest["contamination_parameters"]
else:
return None

def produce_dataset(
self,
*,
Expand All @@ -707,6 +715,7 @@ def produce_dataset(
use_fitcommondata=False,
fit=None,
check_plotting: bool = False,
contamination_data = None,
):
"""Dataset specification from the theory and CommonData.
Use the cuts from the fit, if provided. If check_plotting is set to
Expand All @@ -719,6 +728,8 @@ def produce_dataset(
weight = dataset_input.weight
simu_parameters_names = dataset_input.simu_parameters_names
use_fixed_predictions = dataset_input.use_fixed_predictions
contamination = dataset_input.contamination
contamination_data = contamination_data

try:
ds = self.loader.check_dataset(
Expand All @@ -733,6 +744,8 @@ def produce_dataset(
weight=weight,
simu_parameters_names=simu_parameters_names,
use_fixed_predictions=use_fixed_predictions,
contamination=contamination,
contamination_data=contamination_data,
)
except DataNotFoundError as e:
raise ConfigError(str(e), name, self.loader.available_datasets)
Expand Down
34 changes: 31 additions & 3 deletions validphys2/src/validphys/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def plot_kinlabels(self):
class DataSetInput(TupleComp):
"""Represents whatever the user enters in the YAML to specify a
dataset."""
def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_parameters_names, use_fixed_predictions):
def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_parameters_names, use_fixed_predictions, contamination):
self.name=name
self.sys=sys
self.cfac = cfac
Expand All @@ -333,6 +333,7 @@ def __init__(self, *, name, sys, cfac, frac, weight, custom_group, simu_paramete
self.custom_group = custom_group
self.simu_parameters_names = simu_parameters_names
self.use_fixed_predictions = use_fixed_predictions
self.contamination = contamination
super().__init__(name, sys, cfac, frac, weight, custom_group)

def __str__(self):
Expand Down Expand Up @@ -461,10 +462,12 @@ def cut_mask(cuts):
class DataSetSpec(TupleComp):

def __init__(self, *, name, commondata, fkspecs, thspec, cuts,
frac=1, op=None, weight=1, simu_parameters_names_CF=None, simu_parameters_names=None, use_fixed_predictions=False):
frac=1, op=None, weight=1, simu_parameters_names_CF=None, simu_parameters_names=None, use_fixed_predictions=False, contamination=None, contamination_data=None):
self.name = name
self.commondata = commondata
self.use_fixed_predictions = use_fixed_predictions
self.contamination = contamination
self.contamination_data = contamination_data

if isinstance(fkspecs, FKTableSpec):
fkspecs = (fkspecs,)
Expand Down Expand Up @@ -503,7 +506,32 @@ def load(self):

fkset = FKSet(FKSet.parseOperator(self.op), fktables)

data = DataSet(cd, fkset, self.weight, self.use_fixed_predictions, int(self.thspec.id))
if self.contamination:
# Determine the multiplicative factors that we have to contaminate the closure test
# pseudodata with.
simu_fac_path = str(self.fkspecs[0].fkpath).split('fastkernel')[0] + "simu_factors/SIMU_" + cd.GetSetName() + ".yaml"
with open(simu_fac_path, 'rb') as file:
simu_file = yaml.safe_load(file)
contamination_values = np.array([1.0]*cd.GetNData())
for parameter in self.contamination_data:
if parameter['name'] in simu_file[self.contamination].keys():
op_prediction = parameter['value']*np.array(simu_file[self.contamination][parameter['name']])
sm_prediction = np.array(simu_file[self.contamination]['SM'])
percentages = op_prediction / sm_prediction
contamination_values += percentages
contamination_values = contamination_values.tolist()
else:
# The contamination is just an array of 1s
contamination_values = [1.0]*cd.GetNData()

data = DataSet(
cd,
fkset,
contamination_values,
self.weight,
self.use_fixed_predictions,
int(self.thspec.id),
)

if self.cuts is not None:
#ugly need to convert from numpy.int64 to int, so we can pass
Expand Down
4 changes: 4 additions & 0 deletions validphys2/src/validphys/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,8 @@ def check_dataset(
weight=1,
simu_parameters_names=None,
use_fixed_predictions=False,
contamination=None,
contamination_data=None,
):

if not isinstance(theoryid, TheoryIDSpec):
Expand Down Expand Up @@ -595,6 +597,8 @@ def check_dataset(
simu_parameters_names_CF=simu_parameters_names_CF,
simu_parameters_names=simu_parameters_names,
use_fixed_predictions=use_fixed_predictions,
contamination=contamination,
contamination_data=contamination_data,
)

def check_experiment(self, name: str, datasets: List[DataSetSpec]) -> DataGroupSpec:
Expand Down

0 comments on commit 712712b

Please sign in to comment.