From 600d93136dd8572f22812c819b4c5d88fff7ec23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tibor=20=C5=A0imko?=
Date: Thu, 18 Jan 2024 11:33:22 +0100
Subject: [PATCH] WIP

---
 cms-2016-simulated-datasets/README.md |  16 +-
 .../code/lhe_generators.py            | 290 +++++++++++++-----
 2 files changed, 215 insertions(+), 91 deletions(-)

diff --git a/cms-2016-simulated-datasets/README.md b/cms-2016-simulated-datasets/README.md
index 4f4f3007d..bde90a0b2 100644
--- a/cms-2016-simulated-datasets/README.md
+++ b/cms-2016-simulated-datasets/README.md
@@ -43,7 +43,7 @@ Warning: Creating the full local cache might take a long time.
 First step is to create EOS file index cache:
 
 ```console
-$ time python3 ./code/interface.py --create-eos-indexes inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-eos-indexes inputs/CMS-2016-mc-datasets.txt
 ```
 
 This requires the data files to be placed in their final location. However, for
@@ -53,17 +53,17 @@ by means of adding the command-line option `--ignore-eos-store` to the commands
 We can now build sample records by doing:
 
 ```console
-$ time python3 ./code/interface.py --create-das-json-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-das-json-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 
 $ auth-get-sso-cookie -u https://cms-pdmv.cern.ch/mcm -o cookies.txt
-$ time python3 ./code/interface.py --create-mcm-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-mcm-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 
-$ time python3 ./code/interface.py --get-conf-files --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --get-conf-files --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 
-$ time python3 code/lhe_generators.py
+$ python3 code/lhe_generators.py
 
-$ time python3 ./code/interface.py --create-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
-$ time python3 ./code/interface.py --create-conffiles-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-conffiles-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 ```
 
 Note that to build the test records an (empty) input file for DOIs and a recid
@@ -80,7 +80,7 @@ The output JSON files for the dataset records will be generated in the
 
 
 ```console
-python3 code/lhe_generators.py 2> errors > output &
+$ python3 code/lhe_generators.py >& output
 ```
 
 - This will get lhe generator parameters from gridpacks for datasets listed in `./inputs/CMS-2016-mc-datasets.txt`.
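The rewritten `lhe_generators.py` in the next diff replaces the old stderr prints with per-record log files written by its new `log()` helper. As a quick sanity check after a run, one can grep those logs for errors; this is a minimal sketch assuming the default `gen_store` location `./lhe_generators/2016-sim` used by the script:

```console
$ python3 code/lhe_generators.py >& output
$ grep -l ' | ERROR | ' lhe_generators/2016-sim/gridpacks/*/LOG.txt
```

Each `LOG.txt` line has the form `YYYY-MM-DD HH:MM:SS | TYPE | message`, so filtering on the middle column distinguishes INFO, WARNING, ERROR and DEBUG entries.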
diff --git a/cms-2016-simulated-datasets/code/lhe_generators.py b/cms-2016-simulated-datasets/code/lhe_generators.py
index 745c5315c..5daed4ba6 100644
--- a/cms-2016-simulated-datasets/code/lhe_generators.py
+++ b/cms-2016-simulated-datasets/code/lhe_generators.py
@@ -1,126 +1,250 @@
+#!/usr/bin/env python3
+
+import datetime
+import fnmatch
+import os
+import re
+import requests
+import subprocess
+import threading  # used by the thread-throttled main loop below
+import urllib3
+
 from dataset_records import *
-from os import listdir
-from os.path import isfile, join
-from requests.packages.urllib3.exceptions import InsecureRequestWarning
+from mcm_store import get_mcm_dict
+from utils import get_from_deep_json
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+RECID_INFO = {}
+exec(open("inputs/recid_info.py", "r").read())  # import RECID_INFO
 
-exec(open('inputs/recid_info.py', 'r').read())  # import RECID_INFO
-requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 
+def log(recid, logtype, logmessage):
+    """Store a log message of a certain type to record-ID-based log file system."""
+    logdir = f"./lhe_generators/2016-sim/gridpacks/{recid}"
+    if not os.path.exists(logdir):
+        os.makedirs(logdir)
+    with open(f"{logdir}/LOG.txt", "a") as fdesc:
+        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        fdesc.write(f"{now} | {logtype} | {logmessage}\n")
+
 
-# get LHE Parent or False
 def get_lhe(dataset, mcm_dir):
-    path = mcm_dir + '/chain/' + dataset.replace('/', '@')
+    """Get LHE Parent or False"""
+    path = mcm_dir + "/chain/" + dataset.replace("/", "@")
     step_dirs = os.listdir(path)
     for step in step_dirs:
-        step_dir = path + '/' + step
-        datatier = get_from_deep_json(get_mcm_dict(dataset,step_dir),'datatier')
-        if "LHE" in datatier: 
+        step_dir = path + "/" + step
+        datatier = get_from_deep_json(get_mcm_dict(dataset, step_dir), "datatier")
+        if "LHE" in datatier:
             return step_dir
     return False
 
 
-def cmd_run(cmds, dataset):
+def cmd_run(cmds, recid):
     for cmd in cmds:
-        err = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE,
-                             stdout=subprocess.PIPE).stderr.decode()
+        err = subprocess.run(
+            cmd, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE
+        ).stderr.decode()
         if err:
-            print("\n[Error] in " + dataset + "\n==>\t" +
-                  err + "<==\n", file=sys.stderr)
+            log(recid, "ERROR", f"Error {err}")
             return False
     return True
 
 
-def create_lhe_generator(dataset, recid, mcm_dir, gen_store='./lhe_generators/2016-sim'):
-# mcm_dir is the directory of the LHE step
-    fragment_url = get_genfragment_url(dataset, mcm_dir)
-    if fragment_url:
-        fragment_url = fragment_url[0]
-        fragment = requests.get(fragment_url, verify=False).text
-        if not fragment:
-            fragment = get_from_deep_json(
-                get_mcm_dict(dataset, mcm_dir), "fragment")
-    else:
-        fragment = get_from_deep_json(
-            get_mcm_dict(dataset, mcm_dir), "fragment")
+def create_lhe_generator(
+    dataset, recid, mcm_dir, gen_store="./lhe_generators/2016-sim"
+):
+    # mcm_dir is the directory of the LHE step
+    mcdb_id = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "mcdb_id") or 0
+    if mcdb_id > 0:
+        log(recid, "WARNING", f"Skipping because of mcdb_id value {mcdb_id}")
+        return
+
+    # Find fragment
+    fragment_url = get_genfragment_url(dataset, mcm_dir)
+    if fragment_url:
+        fragment_url = fragment_url[0]
+        fragment = requests.get(fragment_url, verify=False).text
         if not fragment:
-            print("\n[Error] in" + dataset +
-                  "\n==>\t No fragment URL and Empty fragment in mcm dict, Skipping\n", file=sys.stderr)
-            return
-
-    path = re.search(r"cms.vstring\('(.*?)'", fragment)
-
-    if not path:
-        print("\n[Warning] in" + dataset +
-              "\n==>\t 'cms.vstring' not found in fragment , Skipping\n", file=sys.stderr)
-        return
-    path = path.group(1)
-    # print("found path: " + str(path) )
-    outfilepath = "{gen_store}/gridpacks/{recid}".format(
-        gen_store=gen_store, recid=recid)
-
-    if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) != 0:
-        print(str(recid) + ' recid gridpack Exist, Skipping')
-        return
-
-    if 'amcatnlo' in path or 'amcatnlo' in dataset:
-        print(dataset + '\n' + str(recid) +
-              "amcatnlo gridpack!!! path:" + path)
-        files = [
-            'process/Cards/run_card.dat',
-            'process/Cards/proc_card*.dat',
-            'process/Cards/param_card.dat',
+            fragment = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "fragment")
+    else:
+        fragment = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "fragment")
+    if not fragment:
+        log(
+            recid,
+            "ERROR",
+            "No fragment URL and empty fragment in mcm dict; skipping.",
+        )
+        return
+
+    # Find gridpack path
+    path = re.search(r"cms.vstring\('(/cvmfs.*?)'", fragment)
+    if not path:
+        log(
+            recid,
+            "ERROR",
+            "No 'cms.vstring(/cvmfs' found in fragment; skipping.",
+        )
+        return
+
+    path = path.group(1)
+    log(recid, "INFO", f"Found path {path}")
+    outfilepath = "{gen_store}/gridpacks/{recid}".format(
+        gen_store=gen_store, recid=recid
+    )
+    if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) > 1:
+        log(
+            recid,
+            "WARNING",
+            "Gridpack seems to exist for this record ID already. Skipping.",
+        )
+        return
+
+    # Identify gridpack case
+    gridpack_case = "UNKNOWN"
+    path_lower = path.lower()
+    path_lower_position = {}
+    for acase in ["amcatnlo", "madgraph", "powheg", "jhugen", "phantom", "mcfm"]:
+        path_lower_position[acase] = path_lower.find(acase)
+    found = 1e10
+    for key, val in path_lower_position.items():
+        if val > 0 and val < found:
+            gridpack_case = key
+            found = val  # remember the earliest match so the first generator name in the path wins
+    if gridpack_case == "UNKNOWN":
+        log(recid, "ERROR", f"Found case {gridpack_case}")
+    else:
+        log(recid, "INFO", f"Found case {gridpack_case}")
+
+    # List content of all files in the gridpack tarball
+    files_all = []
+    res = subprocess.check_output(f"tar tf {path}", shell=True)
+    for line in res.splitlines():
+        files_all.append(line.decode())
+
+    # Select interesting files based on gridpack case
+    files = [
+        "./InputCards/*.dat",
+        "./runcmsgrid.sh",
+        "InputCards/*.dat",
+        "runcmsgrid.sh",
+    ]
+    if gridpack_case == "amcatnlo":
+        files.extend(
+            [
+                "./process/Cards/param_card.dat",
+                "./process/Cards/proc_card*.dat",
+                "./process/Cards/run_card.dat",
+                "process/Cards/param_card.dat",
+                "process/Cards/proc_card*.dat",
+                "process/Cards/run_card.dat",
             ]
-        mv_cmd = "mv process/Cards/*dat .; rmdir -p process/Cards"
-    elif 'madgraph' in path:
-        files = [
-            'process/madevent/Cards/run_card.dat',
-            'process/madevent/Cards/proc_card*.dat',
-            'process/madevent/Cards/param_card.dat',
+        )
+    elif gridpack_case == "madgraph":
+        files.extend(
+            [
+                "./process/madevent/Cards/param_card.dat",
+                "./process/madevent/Cards/proc_card*.dat",
+                "./process/madevent/Cards/run_card.dat",
+                "process/madevent/Cards/param_card.dat",
+                "process/madevent/Cards/proc_card*.dat",
+                "process/madevent/Cards/run_card.dat",
            ]
-        mv_cmd = "mv process/madevent/Cards/*dat .; rmdir -p process/madevent/Cards"
-    elif 'powheg' in path:
-        files = [
-            '*.input',
+        )
+    elif gridpack_case == "powheg":
+        files.extend(
+            [
+                "*.input",
             ]
-        mv_cmd = ""
-    else:
-        print("\n[Error] Unknown path:('" + path +
-              "')\nDataset: " + dataset + '\n', file=sys.stderr)
-        return
-
-    files = "'" + "' '".join(files) + "'"
+        )
+    elif gridpack_case == "jhugen":
+        files.extend(
+            [
+                "./jhugen.input",
+                "./jhugen_decay.input",
+                "jhugen.input",
+                "jhugen_decay.input",
+            ]
+        )
+    elif gridpack_case == "phantom":
+        files.extend(
+            [
+                "./r_GEN.in",
+                "r_GEN.in",
+            ]
+        )
+    elif gridpack_case == "mcfm":
+        files.extend(
+            [
+                "./readInput.DAT",
+                "readInput.DAT",
+            ]
+        )
+
+    # Select only those files that are present
+    files_selected = []
+    for afile in files:
+        files_selected.extend(fnmatch.filter(files_all, afile))
+
+    # Warn if there was no runcmsgrid or InputCards found for some cases
+    if gridpack_case in ("amcatnlo", "madgraph"):
+        if "InputCards" not in " ".join(files_selected):
+            log(recid, "ERROR", "InputCards not present in the tarball.")
+        if "runcmsgrid.sh" not in " ".join(files_selected):
+            log(recid, "ERROR", "runcmsgrid.sh not present in the tarball.")
+
+    # Warn if no interesting files were found at all
+    if len(files_selected) == 0:
+        log(recid, "ERROR", "Found no interesting files at all.")
+    else:
+        # Inform about which files are going to be extracted
+        log(
+            recid,
+            "INFO",
+            f"Found the following interesting files: {' '.join(files_selected)}",
+        )
+
+    # Prepare the tarball extraction command
     cmds = [
-        "mkdir -p {out}; cd {out};\
-            tar -xf {path} {files} -C {out}; {mv}".format(out=outfilepath, path=path, files=files, mv=mv_cmd)
+        f"mkdir -p {outfilepath}; cd {outfilepath}; tar -xf {path} {' '.join(files_selected)} -C {outfilepath}"
     ]
-    # print("Prepared commands: " + str(cmds))
-    cmd_run(cmds, dataset)
+    log(recid, "INFO", f"Executing commands {cmds}")
+    # Run the tarball extraction command
+    cmd_run(cmds, recid)
+
+    # Print full content of gridpack tarball for debugging purposes
+    log(recid, "DEBUG", "Full gridpack tarball content is:")
+    for afile in files_all:
+        log(recid, "DEBUG", f"- {afile}")
 
 
 das_dir = "./inputs/das-json-store"
 mcm_dir = "./inputs/mcm-store"
-with open("./inputs/CMS-2016-mc-datasets.txt", 'r') as file:
+with open("./inputs/CMS-2016-mc-datasets.txt", "r") as file:
     dataset_full_names = file.readlines()
 
-dataset_nanoaod = [name[:-1] for name in dataset_full_names if name[:-1].endswith('NANOAODSIM')]
+dataset_nanoaod = [
+    name[:-1] for name in dataset_full_names if name[:-1].endswith("NANOAODSIM")
+]
 
 i = 1
 l = len(dataset_nanoaod)
 
 for dataset in dataset_nanoaod:
+    recid = RECID_INFO[dataset]
 
-    #dataset = dataset[:-1]
+    print(f"Getting LHE {i}/{l}")
+    log(recid, "INFO", f"Getting LHE {i}/{l}")
+    log(recid, "INFO", f"Found record ID {recid}")
+    log(recid, "INFO", f"Found dataset {dataset}")
     lhe_dir = get_lhe(dataset, mcm_dir)
     if not lhe_dir:
+        log(recid, "ERROR", "There is no LHE directory. Skipping.")
         continue
-    recid = RECID_INFO[dataset]
-
-    print("Getting ({i}/{l}): {ds}".format(
-        i=i, l=l, ds=lhe_dir or 'No LHE parent for this record'))
+    log(recid, "INFO", f"Found LHE directory {lhe_dir}")
 
-    t = threading.Thread(target=create_lhe_generator,
-                         args=(dataset, recid, lhe_dir))
+    t = threading.Thread(target=create_lhe_generator, args=(dataset, recid, lhe_dir))
     t.start()
     i += 1
     while threading.activeCount() >= 20: