From 600d93136dd8572f22812c819b4c5d88fff7ec23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tibor=20=C5=A0imko?=
Date: Thu, 18 Jan 2024 11:33:22 +0100
Subject: [PATCH] WIP

---
 cms-2016-simulated-datasets/README.md |  16 +-
 .../code/lhe_generators.py            | 290 +++++++++++++-----
 2 files changed, 215 insertions(+), 91 deletions(-)

diff --git a/cms-2016-simulated-datasets/README.md b/cms-2016-simulated-datasets/README.md
index 4f4f3007d..bde90a0b2 100644
--- a/cms-2016-simulated-datasets/README.md
+++ b/cms-2016-simulated-datasets/README.md
@@ -43,7 +43,7 @@ Warning: Creating the full local cache might take a long time.
 First step is to create EOS file index cache:
 
 ```console
-$ time python3 ./code/interface.py --create-eos-indexes inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-eos-indexes inputs/CMS-2016-mc-datasets.txt
 ```
 
 This requires the data files to be placed in their final location. However, for
@@ -53,17 +53,17 @@ by means of adding the command-line option `--ignore-eos-store` to the commands
 We can now build sample records by doing:
 
 ```console
-$ time python3 ./code/interface.py --create-das-json-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-das-json-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 
 $ auth-get-sso-cookie -u https://cms-pdmv.cern.ch/mcm -o cookies.txt
-$ time python3 ./code/interface.py --create-mcm-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-mcm-store --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 
-$ time python3 ./code/interface.py --get-conf-files --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --get-conf-files --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 
-$ time python3 code/lhe_generators.py
+$ python3 code/lhe_generators.py
 
-$ time python3 ./code/interface.py --create-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
-$ time python3 ./code/interface.py --create-conffiles-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
+$ python3 ./code/interface.py --create-conffiles-records --ignore-eos-store inputs/CMS-2016-mc-datasets.txt
 ```
 
 Note that to build the test records an (empty) input file for DOIs and a recid
@@ -80,7 +80,7 @@ The output JSON files for the dataset records will be generated in the
 
 
 ```console
-python3 code/lhe_generators.py 2> errors > output &
+$ python3 code/lhe_generators.py >& output
 ```
 
 - This will get lhe generator parameters from gridpacks for datasets listed in `./inputs/CMS-2016-mc-datasets.txt`.
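The rewritten `lhe_generators.py` in the next diff replaces the old stderr prints with per-record log files written by its new `log()` helper. As a quick sanity check after a run, one can grep those logs for errors; this is a minimal sketch assuming the default `gen_store` location `./lhe_generators/2016-sim` used by the script:

```console
$ python3 code/lhe_generators.py >& output
$ grep -l ' | ERROR | ' lhe_generators/2016-sim/gridpacks/*/LOG.txt
```

Each `LOG.txt` line has the form `YYYY-MM-DD HH:MM:SS | TYPE | message`, so filtering on the middle column distinguishes INFO, WARNING, ERROR and DEBUG entries.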
diff --git a/cms-2016-simulated-datasets/code/lhe_generators.py b/cms-2016-simulated-datasets/code/lhe_generators.py
index 745c5315c..5daed4ba6 100644
--- a/cms-2016-simulated-datasets/code/lhe_generators.py
+++ b/cms-2016-simulated-datasets/code/lhe_generators.py
@@ -1,126 +1,250 @@
+#!/usr/bin/env python3
+
+import datetime
+import fnmatch
+import os
+import re
+import requests
+import subprocess
+import threading  # used by the thread-throttled main loop below
+import urllib3
+
 from dataset_records import *
-from os import listdir
-from os.path import isfile, join
-from requests.packages.urllib3.exceptions import InsecureRequestWarning
+from mcm_store import get_mcm_dict
+from utils import get_from_deep_json
+
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+RECID_INFO = {}
+exec(open("inputs/recid_info.py", "r").read())  # import RECID_INFO
 
-exec(open('inputs/recid_info.py', 'r').read())  # import RECID_INFO
-requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
 
+def log(recid, logtype, logmessage):
+    """Store a log message of a certain type to record-ID-based log file system."""
+    logdir = f"./lhe_generators/2016-sim/gridpacks/{recid}"
+    if not os.path.exists(logdir):
+        os.makedirs(logdir)
+    with open(f"{logdir}/LOG.txt", "a") as fdesc:
+        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        fdesc.write(f"{now} | {logtype} | {logmessage}\n")
+
 
-# get LHE Parent or False
 def get_lhe(dataset, mcm_dir):
-    path = mcm_dir + '/chain/' + dataset.replace('/', '@')
+    """Get LHE Parent or False"""
+    path = mcm_dir + "/chain/" + dataset.replace("/", "@")
     step_dirs = os.listdir(path)
     for step in step_dirs:
-        step_dir = path + '/' + step
-        datatier = get_from_deep_json(get_mcm_dict(dataset,step_dir),'datatier')
-        if "LHE" in datatier: 
+        step_dir = path + "/" + step
+        datatier = get_from_deep_json(get_mcm_dict(dataset, step_dir), "datatier")
+        if "LHE" in datatier:
             return step_dir
     return False
 
 
-def cmd_run(cmds, dataset):
+def cmd_run(cmds, recid):
     for cmd in cmds:
-        err = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE,
-                             stdout=subprocess.PIPE).stderr.decode()
+        err = subprocess.run(
+            cmd, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE
+        ).stderr.decode()
         if err:
-            print("\n[Error] in " + dataset + "\n==>\t" +
-                  err + "<==\n", file=sys.stderr)
+            log(recid, "ERROR", f"Error {err}")
             return False
     return True
 
 
-def create_lhe_generator(dataset, recid, mcm_dir, gen_store='./lhe_generators/2016-sim'):
-# mcm_dir is the directory of the LHE step
-    fragment_url = get_genfragment_url(dataset, mcm_dir)
-    if fragment_url:
-        fragment_url = fragment_url[0]
-        fragment = requests.get(fragment_url, verify=False).text
-        if not fragment:
-            fragment = get_from_deep_json(
-                get_mcm_dict(dataset, mcm_dir), "fragment")
-    else:
-        fragment = get_from_deep_json(
-            get_mcm_dict(dataset, mcm_dir), "fragment")
+def create_lhe_generator(
+    dataset, recid, mcm_dir, gen_store="./lhe_generators/2016-sim"
+):
+    # mcm_dir is the directory of the LHE step
+    mcdb_id = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "mcdb_id") or 0
+    if mcdb_id > 0:
+        log(recid, "WARNING", f"Skipping because of mcdb_id value {mcdb_id}")
+        return
+
+    # Find fragment
+    fragment_url = get_genfragment_url(dataset, mcm_dir)
+    if fragment_url:
+        fragment_url = fragment_url[0]
+        fragment = requests.get(fragment_url, verify=False).text
         if not fragment:
-            print("\n[Error] in" + dataset +
-                  "\n==>\t No fragment URL and Empty fragment in mcm dict, Skipping\n", file=sys.stderr)
-            return
-
-    path = re.search(r"cms.vstring\('(.*?)'", fragment)
-
-    if not path:
-        print("\n[Warning] in" + dataset +
-              "\n==>\t 'cms.vstring' not found in fragment , Skipping\n", file=sys.stderr)
-        return
-    path = path.group(1)
-    # print("found path: " + str(path) )
-    outfilepath = "{gen_store}/gridpacks/{recid}".format(
-        gen_store=gen_store, recid=recid)
-
-    if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) != 0:
-        print(str(recid) + ' recid gridpack Exist, Skipping')
-        return
-
-    if 'amcatnlo' in path or 'amcatnlo' in dataset:
-        print(dataset + '\n' + str(recid) +
-              "amcatnlo gridpack!!! path:" + path)
-        files = [
-            'process/Cards/run_card.dat',
-            'process/Cards/proc_card*.dat',
-            'process/Cards/param_card.dat',
+            fragment = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "fragment")
+    else:
+        fragment = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), "fragment")
+    if not fragment:
+        log(
+            recid,
+            "ERROR",
+            "No fragment URL and empty fragment in mcm dict; skipping.",
+        )
+        return
+
+    # Find gridpack path
+    path = re.search(r"cms.vstring\('(/cvmfs.*?)'", fragment)
+    if not path:
+        log(
+            recid,
+            "ERROR",
+            "No 'cms.vstring(/cvmfs' found in fragment; skipping.",
+        )
+        return
+
+    path = path.group(1)
+    log(recid, "INFO", f"Found path {path}")
+    outfilepath = "{gen_store}/gridpacks/{recid}".format(
+        gen_store=gen_store, recid=recid
+    )
+    if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) > 1:
+        log(
+            recid,
+            "WARNING",
+            "Gridpack seems to exist for this record ID already. Skipping.",
+        )
+        return
+
+    # Identify gridpack case
+    gridpack_case = "UNKNOWN"
+    path_lower = path.lower()
+    path_lower_position = {}
+    for acase in ["amcatnlo", "madgraph", "powheg", "jhugen", "phantom", "mcfm"]:
+        path_lower_position[acase] = path_lower.find(acase)
+    found = 1e10
+    for key, val in path_lower_position.items():
+        if val > 0 and val < found:
+            gridpack_case = key
+            found = val  # remember the earliest match so the first generator name in the path wins
+    if gridpack_case == "UNKNOWN":
+        log(recid, "ERROR", f"Found case {gridpack_case}")
+    else:
+        log(recid, "INFO", f"Found case {gridpack_case}")
+
+    # List content of all files in the gridpack tarball
+    files_all = []
+    res = subprocess.check_output(f"tar tf {path}", shell=True)
+    for line in res.splitlines():
+        files_all.append(line.decode())
+
+    # Select interesting files based on gridpack case
+    files = [
+        "./InputCards/*.dat",
+        "./runcmsgrid.sh",
+        "InputCards/*.dat",
+        "runcmsgrid.sh",
+    ]
+    if gridpack_case == "amcatnlo":
+        files.extend(
+            [
+                "./process/Cards/param_card.dat",
+                "./process/Cards/proc_card*.dat",
+                "./process/Cards/run_card.dat",
+                "process/Cards/param_card.dat",
+                "process/Cards/proc_card*.dat",
+                "process/Cards/run_card.dat",
             ]
-        mv_cmd = "mv process/Cards/*dat .; rmdir -p process/Cards"
-    elif 'madgraph' in path:
-        files = [
-            'process/madevent/Cards/run_card.dat',
-            'process/madevent/Cards/proc_card*.dat',
-            'process/madevent/Cards/param_card.dat',
+        )
+    elif gridpack_case == "madgraph":
+        files.extend(
+            [
+                "./process/madevent/Cards/param_card.dat",
+                "./process/madevent/Cards/proc_card*.dat",
+                "./process/madevent/Cards/run_card.dat",
+                "process/madevent/Cards/param_card.dat",
+                "process/madevent/Cards/proc_card*.dat",
+                "process/madevent/Cards/run_card.dat",
            ]
-        mv_cmd = "mv process/madevent/Cards/*dat .; rmdir -p process/madevent/Cards"
-    elif 'powheg' in path:
-        files = [
-            '*.input',
+        )
+    elif gridpack_case == "powheg":
+        files.extend(
+            [
+                "*.input",
             ]
-        mv_cmd = ""
-    else:
-        print("\n[Error] Unknown path:('" + path +
-              "')\nDataset: " + dataset + '\n', file=sys.stderr)
-        return
-
-    files = "'" + "' '".join(files) + "'"
+        )
+    elif gridpack_case == "jhugen":
+        files.extend(
+            [
+                "./jhugen.input",
+                "./jhugen_decay.input",
+                "jhugen.input",
+                "jhugen_decay.input",
+            ]
+        )
+    elif gridpack_case == "phantom":
+        files.extend(
+            [
+                "./r_GEN.in",
+                "r_GEN.in",
+            ]
+        )
+    elif gridpack_case == "mcfm":
+        files.extend(
+            [
+                "./readInput.DAT",
+                "readInput.DAT",
+            ]
+        )
+
+    # Select only those files that are present
+    files_selected = []
+    for afile in files:
+        files_selected.extend(fnmatch.filter(files_all, afile))
+
+    # Warn if there was no runcmsgrid or InputCards found for some cases
+    if gridpack_case in ("amcatnlo", "madgraph"):
+        if "InputCards" not in " ".join(files_selected):
+            log(recid, "ERROR", "InputCards not present in the tarball.")
+        if "runcmsgrid.sh" not in " ".join(files_selected):
+            log(recid, "ERROR", "runcmsgrid.sh not present in the tarball.")
+
+    # Warn if no interesting files were found at all
+    if len(files_selected) == 0:
+        log(recid, "ERROR", "Found no interesting files at all.")
+    else:
+        # Inform about which files are going to be extracted
+        log(
+            recid,
+            "INFO",
+            f"Found the following interesting files: {' '.join(files_selected)}",
+        )
+
+    # Prepare the tarball extraction command
     cmds = [
-        "mkdir -p {out}; cd {out};\
-            tar -xf {path} {files} -C {out}; {mv}".format(out=outfilepath, path=path, files=files, mv=mv_cmd)
+        f"mkdir -p {outfilepath}; cd {outfilepath}; tar -xf {path} {' '.join(files_selected)} -C {outfilepath}"
     ]
-    # print("Prepared commands: " + str(cmds))
-    cmd_run(cmds, dataset)
+    log(recid, "INFO", f"Executing commands {cmds}")
+    # Run the tarball extraction command
+    cmd_run(cmds, recid)
+
+    # Print full content of gridpack tarball for debugging purposes
+    log(recid, "DEBUG", "Full gridpack tarball content is:")
+    for afile in files_all:
+        log(recid, "DEBUG", f"- {afile}")
 
 
 das_dir = "./inputs/das-json-store"
 mcm_dir = "./inputs/mcm-store"
-with open("./inputs/CMS-2016-mc-datasets.txt", 'r') as file:
+with open("./inputs/CMS-2016-mc-datasets.txt", "r") as file:
     dataset_full_names = file.readlines()
 
-dataset_nanoaod = [name[:-1] for name in dataset_full_names if name[:-1].endswith('NANOAODSIM')]
+dataset_nanoaod = [
+    name[:-1] for name in dataset_full_names if name[:-1].endswith("NANOAODSIM")
+]
 
 i = 1
 l = len(dataset_nanoaod)
 
 for dataset in dataset_nanoaod:
+    recid = RECID_INFO[dataset]
 
-    #dataset = dataset[:-1]
+    print(f"Getting LHE {i}/{l}")
+    log(recid, "INFO", f"Getting LHE {i}/{l}")
+    log(recid, "INFO", f"Found record ID {recid}")
+    log(recid, "INFO", f"Found dataset {dataset}")
     lhe_dir = get_lhe(dataset, mcm_dir)
     if not lhe_dir:
+        log(recid, "ERROR", "There is no LHE directory. Skipping.")
         continue
-    recid = RECID_INFO[dataset]
-
-    print("Getting ({i}/{l}): {ds}".format(
-        i=i, l=l, ds=lhe_dir or 'No LHE parent for this record'))
+    log(recid, "INFO", f"Found LHE directory {lhe_dir}")
 
-    t = threading.Thread(target=create_lhe_generator,
-                         args=(dataset, recid, lhe_dir))
+    t = threading.Thread(target=create_lhe_generator, args=(dataset, recid, lhe_dir))
     t.start()
     i += 1
     while threading.activeCount() >= 20: