From 67f4185611e6ec1ff194ff61f7b0cc3b974bab3c Mon Sep 17 00:00:00 2001
From: James Moore <jmm232@login-p-1.data.cluster>
Date: Wed, 22 Nov 2023 11:01:33 +0000
Subject: [PATCH 1/5] Added theory uncertainties for fit

---
 n3fit/src/n3fit/model_gen.py     | 37 +++++++++++++++++++++++++++++++-
 n3fit/src/n3fit/model_trainer.py |  6 +++++-
 n3fit/src/n3fit/performfit.py    |  3 +++
 3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
index 96db1d2ea..c47cd399e 100644
--- a/n3fit/src/n3fit/model_gen.py
+++ b/n3fit/src/n3fit/model_gen.py
@@ -13,6 +13,11 @@
 from importlib.util import spec_from_loader
 import numpy as np
 import tensorflow as tf
+import yaml
+
+import os
+
+import scipy as sp
 
 from validphys import bsmnames
 
@@ -25,6 +30,8 @@
 from n3fit.backends import base_layer_selector, regularizer_selector
 from n3fit.layers.CombineCfac import CombineCfacLayer
 
+from validphys.loader import _get_nnpdf_profile
+
 import logging
 log = logging.getLogger(__name__)
 
@@ -148,7 +155,12 @@ def __call__(self, pdf_layer, mask=None):
 
 
 def observable_generator(
-    spec_dict, positivity_initial=1.0, integrability=False, post_observable=None
+    spec_dict, 
+    positivity_initial=1.0, 
+    integrability=False, 
+    post_observable=None, 
+    use_th_covmat=False,
+    theoryid=None,
 ):  # pylint: disable=too-many-locals
     """
     This function generates the observable model for each experiment.
@@ -318,6 +330,29 @@ def observable_generator(
         obsrot_tr = None
         obsrot_vl = None
 
+    if use_th_covmat:
+        data_path = str(_get_nnpdf_profile()['data_path']) + 'theory_' + theoryid.id + '/simu_factors'
+        # If using the theory covariance matrix, we must build it here.
+        th_covmats = []
+        for ds in spec_dict.get("datasets"):
+            simu_fac_path = data_path + '/SIMU_' + ds['name'] + '.yaml'
+            if os.path.exists(simu_fac_path):
+                with open(simu_fac_path, 'rb') as file:
+                    simu_file = yaml.safe_load(file)
+                if 'theory_cov' in simu_file.keys():
+                    th_covmats += [simu_file['theory_cov']]
+                else:
+                    th_covmats += [np.zeros((ds['ndata'],ds['ndata']))]
+            else:
+                th_covmats += [np.zeros((ds['ndata'],ds['ndata']))]
+
+        th_covmat = sp.linalg.block_diag(*th_covmats)
+
+        if not np.all((th_covmat == 0.0)):
+            covmat = th_covmat + spec_dict["covmat"]
+            invcovmat = np.linalg.inv(covmat)
+            spec_dict["invcovmat"] = invcovmat 
+
     out_tr = ObservableWrapper(
         spec_name,
         model_obs_tr,
diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py
index f43f1170b..76f4c32d6 100644
--- a/n3fit/src/n3fit/model_trainer.py
+++ b/n3fit/src/n3fit/model_trainer.py
@@ -131,6 +131,8 @@ def __init__(
         model_file=None,
         sum_rules=None,
         parallel_models=1,
+        use_th_covmat=False,
+        theoryid=None,
     ):
         """
         Parameters
@@ -190,6 +192,8 @@ def __init__(
         self.bsm_initialisation_seed = bsm_initialisation_seed
         self.fixed_pdf = fixed_pdf
         self.replicas = replicas
+        self.use_th_covmat = use_th_covmat
+        self.theoryid = theoryid
 
         # Initialise internal variables which define behaviour
         if debug:
@@ -522,7 +526,7 @@ def _generate_observables(
             if not self.mode_hyperopt:
                 log.info("Generating layers for experiment %s", exp_dict["name"])
 
-            exp_layer = model_gen.observable_generator(exp_dict, post_observable=combiner)
+            exp_layer = model_gen.observable_generator(exp_dict, post_observable=combiner, use_th_covmat=self.use_th_covmat, theoryid=self.theoryid)
 
             # Save the input(s) corresponding to this experiment
             self.input_list += exp_layer["inputs"]
diff --git a/n3fit/src/n3fit/performfit.py b/n3fit/src/n3fit/performfit.py
index f05a011bb..fb527eac4 100644
--- a/n3fit/src/n3fit/performfit.py
+++ b/n3fit/src/n3fit/performfit.py
@@ -44,6 +44,7 @@ def performfit(
     parallel_models=False, 
     simu_parameters_names=None,
     bsm_initialisation_seed=0,
+    use_th_covmat=False,
 ):
     """
         This action will (upon having read a validcard) process a full PDF fit
@@ -208,6 +209,8 @@ def performfit(
             simu_parameters_scales=simu_parameters_scales,
             bsm_fac_initialisations=bsm_fac_initialisations,
             bsm_initialisation_seed=bsm_initialisation_seed,
+            use_th_covmat=use_th_covmat,
+            theoryid=theoryid,
         )
 
         # This is just to give a descriptive name to the fit function

From bb22e014efce8be93a3a0902223442b0f44c0739 Mon Sep 17 00:00:00 2001
From: James Moore <jmm232@login-p-1.data.cluster>
Date: Wed, 22 Nov 2023 12:25:10 +0000
Subject: [PATCH 2/5] Chi2s now have theory covariance matrix included

---
 validphys2/src/validphys/results.py | 47 ++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py
index cea92110d..80ed041a1 100644
--- a/validphys2/src/validphys/results.py
+++ b/validphys2/src/validphys/results.py
@@ -12,6 +12,9 @@
 import numpy as np
 import pandas as pd
 import scipy.linalg as la
+import os
+import yaml
+import scipy as sp
 
 from NNPDF import CommonData
 from reportengine.checks import require_one, remove_outer, check_not_empty
@@ -39,7 +42,11 @@
 )
 
 from validphys.n3fit_data_utils import parse_simu_parameters_names_CF
+from validphys.loader import _get_nnpdf_profile
 
+from NNPDF import DataSet, Experiment
+
+import validphys.covmats as vp_covmats
 
 log = logging.getLogger(__name__)
 
@@ -502,7 +509,7 @@ def procs_corrmat(procs_covmat):
     return groups_corrmat(procs_covmat)
 
 
-def results(dataset: (DataSetSpec), pdf: PDF, covariance_matrix, sqrt_covmat, dataset_bsm_factor):
+def results(dataset: (DataSetSpec), pdf: PDF, covariance_matrix, sqrt_covmat, dataset_bsm_factor, use_th_covmat, theoryid):
     
     """Tuple of data and theory results for a single pdf. The data will have an associated
     covariance matrix, which can include a contribution from the theory covariance matrix which
@@ -512,6 +519,34 @@ def results(dataset: (DataSetSpec), pdf: PDF, covariance_matrix, sqrt_covmat, da
     A group of datasets is also allowed.
     (as a result of the C++ code layout)."""
     data = dataset.load()
+
+    if isinstance(data, DataSet):
+        dataset_list = [data]
+    elif isinstance(data, Experiment):
+        dataset_list = [ds for ds in data.DataSets()] 
+ 
+    if use_th_covmat:
+        data_path = str(_get_nnpdf_profile()['data_path']) + 'theory_' + theoryid.id + '/simu_factors'
+        # If using the theory covariance matrix, we must build it here.
+        th_covmats = []
+        for ds in dataset_list:
+            simu_fac_path = data_path + '/SIMU_' + ds.GetSetName() + '.yaml'
+            if os.path.exists(simu_fac_path):
+                with open(simu_fac_path, 'rb') as file:
+                    simu_file = yaml.safe_load(file)
+                if 'theory_cov' in simu_file.keys():
+                    th_covmats += [simu_file['theory_cov']]
+                else:
+                    th_covmats += [np.zeros((ds['ndata'],ds['ndata']))]
+            else:
+                th_covmats += [np.zeros((ds['ndata'],ds['ndata']))]
+
+        th_covmat = sp.linalg.block_diag(*th_covmats)
+
+        if not np.all((th_covmat == 0.0)):
+            covariance_matrix = th_covmat + covariance_matrix
+            sqrt_covmat = vp_covmats.sqrt_covmat(covariance_matrix) 
+
     return (
         DataResult(data, covariance_matrix, sqrt_covmat),
         ThPredictionsResult.from_convolution(pdf, dataset, bsm_factor=dataset_bsm_factor),
@@ -520,11 +555,11 @@ def results(dataset: (DataSetSpec), pdf: PDF, covariance_matrix, sqrt_covmat, da
 
 
 def dataset_inputs_results(
-    data, pdf: PDF, dataset_inputs_covariance_matrix, dataset_inputs_sqrt_covmat, dataset_inputs_bsm_factor
+    data, pdf: PDF, dataset_inputs_covariance_matrix, dataset_inputs_sqrt_covmat, dataset_inputs_bsm_factor, use_th_covmat, theoryid
 ):
     """Like `results` but for a group of datasets"""
     return results(
-        data, pdf, dataset_inputs_covariance_matrix, dataset_inputs_sqrt_covmat, dataset_inputs_bsm_factor
+        data, pdf, dataset_inputs_covariance_matrix, dataset_inputs_sqrt_covmat, dataset_inputs_bsm_factor, use_th_covmat, theoryid
     )
 
 
@@ -552,17 +587,19 @@ def one_or_more_results(
     covariance_matrix,
     sqrt_covmat,
     dataset_bsm_factor,
+    theoryid,
     pdfs: (type(None), Sequence) = None,
     pdf: (type(None), PDF) = None,
+    use_th_covmat=False,
 ):
     """Generate a list of results, where the first element is the data values,
     and the next is either the prediction for pdf or for each of the pdfs.
     Which of the two is selected intelligently depending on the namespace,
     when executing as an action."""
     if pdf:
-        return results(dataset, pdf, covariance_matrix, sqrt_covmat, dataset_bsm_factor)
+        return results(dataset, pdf, covariance_matrix, sqrt_covmat, dataset_bsm_factor, use_th_covmat, theoryid)
     else:
-        return pdf_results(dataset, pdfs, covariance_matrix, sqrt_covmat, dataset_bsm_factor)
+        return pdf_results(dataset, pdfs, covariance_matrix, sqrt_covmat, dataset_bsm_factor, use_th_covmat, theoryid)
     raise ValueError("Either 'pdf' or 'pdfs' is required")
 
 

From b720b69bf1a500ff7bf3216c9824251ca738c6b0 Mon Sep 17 00:00:00 2001
From: James Moore <jmm232@login-p-1.data.cluster>
Date: Wed, 22 Nov 2023 12:27:07 +0000
Subject: [PATCH 3/5] Updated vp-comparefits so that it uses the theory
 covariance matrix, if the fit used it

---
 validphys2/src/validphys/scripts/vp_comparefits.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/validphys2/src/validphys/scripts/vp_comparefits.py b/validphys2/src/validphys/scripts/vp_comparefits.py
index 5d3ab36d1..1cc626a19 100644
--- a/validphys2/src/validphys/scripts/vp_comparefits.py
+++ b/validphys2/src/validphys/scripts/vp_comparefits.py
@@ -232,6 +232,9 @@ def complete_mapping(self):
             },
             'bsm_sector_data': {
                 'from_': 'fit'
+            },
+            'use_th_covmat': {
+                'from_': 'fit'
             }
         }
         refmap = {'id': args['reference_fit'], 'label': args['reference_fit_label']}
@@ -250,6 +253,9 @@ def complete_mapping(self):
             },
             'bsm_sector_data': {
                 'from_': 'fit'
+            },
+            'use_th_covmat': {
+                'from_': 'fit'
             }
         }
         autosettings['use_thcovmat_if_present'] = args['thcovmat_if_present']

From 1c4ca0244e470d1e1bbb8b99ffc026e2b065c6d6 Mon Sep 17 00:00:00 2001
From: James Moore <jmm232@login-p-4.data.cluster>
Date: Wed, 22 Nov 2023 15:11:33 +0000
Subject: [PATCH 4/5] Fixed theory covariance matrix for plots

---
 validphys2/src/validphys/dataplots.py |  6 ++++++
 validphys2/src/validphys/results.py   | 21 ++++++++++++++-------
 2 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py
index af2e7b657..5986ee45f 100644
--- a/validphys2/src/validphys/dataplots.py
+++ b/validphys2/src/validphys/dataplots.py
@@ -270,6 +270,12 @@ def _plot_fancy_impl(results, commondata, cutlist,
             table[('err', i)] = np.abs(err/norm_cv)
         cvcols.append(cvcol)
 
+        print("Central value")
+        print(cv/norm_cv)
+        print("Error")
+        print(err)
+        print(norm_cv)
+        print(err/norm_cv)
 
     figby = sane_groupby_iter(table, info.figure_by)
 
diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py
index 80ed041a1..e860c2048 100644
--- a/validphys2/src/validphys/results.py
+++ b/validphys2/src/validphys/results.py
@@ -56,8 +56,9 @@ class Result:
 
 
 class StatsResult(Result):
-    def __init__(self, stats):
+    def __init__(self, stats, th_covmat=None):
         self.stats = stats
+        self.th_covmat = th_covmat
 
     @property
     def rawdata(self):
@@ -75,7 +76,10 @@ def central_value(self):
 
     @property
     def std_error(self):
-        return self.stats.std_error()
+        err = self.stats.std_error()
+        if self.th_covmat is not None:
+            err = [np.sqrt(err[i]**2 + self.th_covmat[i,i]) for i in range(len(err))]
+        return err
 
     def __len__(self):
         """Returns the number of data points in the result"""
@@ -117,12 +121,12 @@ def sqrtcovmat(self):
 class ThPredictionsResult(StatsResult):
     """Class holding theory prediction, inherits from StatsResult"""
 
-    def __init__(self, dataobj, stats_class, bsm_factor, label=None):
+    def __init__(self, dataobj, stats_class, bsm_factor, label=None, th_covmat=None):
         self.stats_class = stats_class
         self.label = label
         dataobj = pd.DataFrame(dataobj.values * bsm_factor)
         statsobj = stats_class(dataobj.T)
-        super().__init__(statsobj)
+        super().__init__(statsobj, th_covmat)
 
     @staticmethod
     def make_label(pdf, dataset):
@@ -140,7 +144,7 @@ def make_label(pdf, dataset):
         return label
 
     @classmethod
-    def from_convolution(cls, pdf, dataset, bsm_factor):
+    def from_convolution(cls, pdf, dataset, bsm_factor, th_covmat):
         # This should work for both single dataset and whole groups
         try:
             datasets = dataset.datasets
@@ -157,7 +161,7 @@ def from_convolution(cls, pdf, dataset, bsm_factor):
 
         label = cls.make_label(pdf, dataset)
 
-        return cls(th_predictions, pdf.stats_class, bsm_factor, label)
+        return cls(th_predictions, pdf.stats_class, bsm_factor, label, th_covmat)
 
 
 class PositivityResult(StatsResult):
@@ -547,9 +551,12 @@ def results(dataset: (DataSetSpec), pdf: PDF, covariance_matrix, sqrt_covmat, da
             covariance_matrix = th_covmat + covariance_matrix
             sqrt_covmat = vp_covmats.sqrt_covmat(covariance_matrix) 
 
+    else:
+        th_covmat = None
+
     return (
         DataResult(data, covariance_matrix, sqrt_covmat),
-        ThPredictionsResult.from_convolution(pdf, dataset, bsm_factor=dataset_bsm_factor),
+        ThPredictionsResult.from_convolution(pdf, dataset, bsm_factor=dataset_bsm_factor, th_covmat=th_covmat),
     )
 
 

From dfe53b6b62d1765ff674a080f30df659b876aa07 Mon Sep 17 00:00:00 2001
From: James Moore <jmm232@login-p-4.data.cluster>
Date: Wed, 22 Nov 2023 15:12:49 +0000
Subject: [PATCH 5/5] Removed spurious print statements from plots

---
 validphys2/src/validphys/dataplots.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py
index 5986ee45f..44011f3b7 100644
--- a/validphys2/src/validphys/dataplots.py
+++ b/validphys2/src/validphys/dataplots.py
@@ -270,13 +270,6 @@ def _plot_fancy_impl(results, commondata, cutlist,
             table[('err', i)] = np.abs(err/norm_cv)
         cvcols.append(cvcol)
 
-        print("Central value")
-        print(cv/norm_cv)
-        print("Error")
-        print(err)
-        print(norm_cv)
-        print(err/norm_cv)
-
     figby = sane_groupby_iter(table, info.figure_by)