diff --git a/cms-2016-simulated-datasets/code/dataset_records.py b/cms-2016-simulated-datasets/code/dataset_records.py
index dd3a6396c..b2cf12ce0 100644
--- a/cms-2016-simulated-datasets/code/dataset_records.py
+++ b/cms-2016-simulated-datasets/code/dataset_records.py
@@ -187,7 +187,7 @@ def get_all_generator_text(dataset, das_dir, mcm_dir, conf_dir, recid_info):
                 process = ''
             output_dataset = get_output_dataset_from_mcm(dataset, mcm_step_dir)
             if output_dataset:
-                step['output_dataset'] = output_dataset
+                step['output_dataset'] = output_dataset[0]
             release = get_cmssw_version_from_mcm(dataset, mcm_step_dir)
             if release:
                 step['release'] = release
diff --git a/cms-2016-simulated-datasets/code/lhe_generators.py b/cms-2016-simulated-datasets/code/lhe_generators.py
new file mode 100644
index 000000000..745c5315c
--- /dev/null
+++ b/cms-2016-simulated-datasets/code/lhe_generators.py
@@ -0,0 +1,131 @@
+import os
+import re
+import subprocess
+import sys
+import threading
+from time import sleep
+
+import requests
+from requests.packages.urllib3.exceptions import InsecureRequestWarning
+
+from dataset_records import *
+
+exec(open('inputs/recid_info.py', 'r').read())  # import RECID_INFO
+requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+
+
+# return the McM directory of the dataset's LHE step, or False if there is none
+def get_lhe(dataset, mcm_dir):
+    path = mcm_dir + '/chain/' + dataset.replace('/', '@')
+    step_dirs = os.listdir(path)
+    for step in step_dirs:
+        step_dir = path + '/' + step
+        datatier = get_from_deep_json(get_mcm_dict(dataset, step_dir), 'datatier')
+        if datatier and 'LHE' in datatier:
+            return step_dir
+
+    return False
+
+
+def cmd_run(cmds, dataset):
+    "Run the given shell commands, reporting the dataset on any failure."
+    for cmd in cmds:
+        err = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE,
+                             stdout=subprocess.PIPE).stderr.decode()
+        if err:
+            print("\n[Error] in " + dataset + "\n==>\t"
+                  + err + "<==\n", file=sys.stderr)
+            return False
+    return True
+
+
+def create_lhe_generator(dataset, recid, mcm_dir, gen_store='./lhe_generators/2016-sim'):
+    # mcm_dir is the directory of the LHE step
+    fragment_url = get_genfragment_url(dataset, mcm_dir)
+    if fragment_url:
+        fragment_url = fragment_url[0]
+        fragment = requests.get(fragment_url, verify=False).text
+        if not fragment:
+            fragment = get_from_deep_json(
+                get_mcm_dict(dataset, mcm_dir), "fragment")
+    else:
+        fragment = get_from_deep_json(
+            get_mcm_dict(dataset, mcm_dir), "fragment")
+    if not fragment:
+        print("\n[Error] in " + dataset +
+              "\n==>\t No fragment URL and empty fragment in McM dict, skipping\n", file=sys.stderr)
+        return
+
+    path = re.search(r"cms.vstring\('(.*?)'", fragment)
+    if not path:
+        print("\n[Warning] in " + dataset +
+              "\n==>\t 'cms.vstring' not found in fragment, skipping\n", file=sys.stderr)
+        return
+    path = path.group(1)
+    # print("found path: " + str(path))
+    outfilepath = "{gen_store}/gridpacks/{recid}".format(
+        gen_store=gen_store, recid=recid)
+
+    if os.path.exists(outfilepath) and len(os.listdir(outfilepath)) != 0:
+        print('Gridpack for recid ' + str(recid) + ' already exists, skipping')
+        return
+
+    if 'amcatnlo' in path or 'amcatnlo' in dataset:
+        print(dataset + '\n' + str(recid) +
+              " amcatnlo gridpack!\npath: " + path)
+        files = [
+            'process/Cards/run_card.dat',
+            'process/Cards/proc_card*.dat',
+            'process/Cards/param_card.dat',
+        ]
+        mv_cmd = "mv process/Cards/*dat .; rmdir -p process/Cards"
+    elif 'madgraph' in path:
+        files = [
+            'process/madevent/Cards/run_card.dat',
+            'process/madevent/Cards/proc_card*.dat',
+            'process/madevent/Cards/param_card.dat',
+        ]
+        mv_cmd = "mv process/madevent/Cards/*dat .; rmdir -p process/madevent/Cards"
+    elif 'powheg' in path:
+        files = [
+            '*.input',
+        ]
+        mv_cmd = ""
+    else:
+        print("\n[Error] Unknown path: ('" + path +
+              "')\nDataset: " + dataset + '\n', file=sys.stderr)
+        return
+
+    files = "'" + "' '".join(files) + "'"
+    # --wildcards lets GNU tar match member patterns such as proc_card*.dat
+    cmds = [
+        "mkdir -p {out}; cd {out};\
+        tar -xf {path} --wildcards {files}; {mv}".format(out=outfilepath, path=path, files=files, mv=mv_cmd)
+    ]
+    # print("Prepared commands: " + str(cmds))
+    cmd_run(cmds, dataset)
+
+
+das_dir = "./inputs/das-json-store"
+mcm_dir = "./inputs/mcm-store"
+with open("./inputs/CMS-2016-mc-datasets.txt", 'r') as file:
+    dataset_full_names = file.readlines()
+
+dataset_nanoaod = [name[:-1] for name in dataset_full_names if name[:-1].endswith('NANOAODSIM')]
+i = 1
+l = len(dataset_nanoaod)
+for dataset in dataset_nanoaod:
+    lhe_dir = get_lhe(dataset, mcm_dir)
+    if not lhe_dir:
+        continue
+
+    recid = RECID_INFO[dataset]
+
+    print("Getting ({i}/{l}): {ds}".format(i=i, l=l, ds=lhe_dir))
+
+    t = threading.Thread(target=create_lhe_generator,
+                         args=(dataset, recid, lhe_dir))
+    t.start()
+    i += 1
+    while threading.active_count() >= 20:
+        sleep(0.5)  # keep at most 20 extraction threads running in parallel