Skip to content

Commit

Permalink
cms-2016-simulated-datasets: add missing lhe file and fix output data…
Browse files Browse the repository at this point in the history
…set format
  • Loading branch information
Kati Lassila-Perini committed Dec 14, 2023
1 parent 0ab2dd8 commit f6a86a7
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 1 deletion.
2 changes: 1 addition & 1 deletion cms-2016-simulated-datasets/code/dataset_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def get_all_generator_text(dataset, das_dir, mcm_dir, conf_dir, recid_info):
process = ''
output_dataset = get_output_dataset_from_mcm(dataset, mcm_step_dir)
if output_dataset:
step['output_dataset'] = output_dataset
step['output_dataset'] = output_dataset[0]
release = get_cmssw_version_from_mcm(dataset, mcm_step_dir)
if release:
step['release'] = release
Expand Down
127 changes: 127 additions & 0 deletions cms-2016-simulated-datasets/code/lhe_generators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from dataset_records import *
from os import listdir
from os.path import isfile, join
from requests.packages.urllib3.exceptions import InsecureRequestWarning


# Load RECID_INFO (looked up by dataset name further down to obtain the
# record id) by executing the project-local inputs file in this module's
# namespace.  The file is opened via a context manager so the handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): exec() runs arbitrary code; this is only acceptable as long as
# inputs/recid_info.py is a trusted, locally generated file.
with open('inputs/recid_info.py', 'r') as recid_info_file:
    exec(recid_info_file.read())  # import RECID_INFO

# Fragment downloads in this script are made with verify=False, which would
# otherwise emit an InsecureRequestWarning on every request.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

def get_lhe(dataset, mcm_dir):
    """Return the McM step directory of the LHE parent of *dataset*, or False.

    The step directories are listed under
    ``<mcm_dir>/chain/<dataset with '/' replaced by '@'>``.  The first one
    (in ``os.listdir`` order) whose McM dict has a ``datatier`` containing
    ``"LHE"`` is returned as a path string.  ``False`` is returned when no
    step qualifies, including when the chain directory is empty.

    Raises whatever ``os.listdir`` raises if the chain directory is missing.
    """
    path = mcm_dir + '/chain/' + dataset.replace('/', '@')
    for step in os.listdir(path):
        step_dir = path + '/' + step
        datatier = get_from_deep_json(
            get_mcm_dict(dataset, step_dir), 'datatier')
        # Guard against a step whose McM dict has no 'datatier': presumably
        # get_from_deep_json then returns None (other callers in this file
        # already test its result for falsiness), and `"LHE" in None` would
        # raise TypeError and abort the whole run.
        if datatier and "LHE" in datatier:
            return step_dir

    return False


def cmd_run(cmds, dataset):
    """Run shell commands in order and stop at the first failure.

    Each entry of *cmds* is executed through the shell.  A command counts as
    failed when it writes anything to stderr (the original behaviour) or when
    it exits with a non-zero status, even if stderr is empty.  On failure an
    error block tagged with *dataset* is printed to ``sys.stderr`` and the
    remaining commands are not run.

    Returns True when every command succeeded (or *cmds* is empty), False
    otherwise.
    """
    for cmd in cmds:
        proc = subprocess.run(cmd, shell=True, stderr=subprocess.PIPE,
                              stdout=subprocess.PIPE)
        # errors="replace": tool output is not guaranteed to be valid UTF-8
        # and a decoding error must not mask the real failure.
        err = proc.stderr.decode(errors="replace")
        if proc.returncode != 0 and not err:
            # Silent failure: synthesize a message so it is still reported.
            err = "command exited with status {}\n".format(proc.returncode)
        if err:
            print("<pserr>\n[Error] in " + dataset + "\n==>\t" +
                  err + "<==\n</pserr>", file=sys.stderr)
            return False
    return True


def _load_fragment(dataset, mcm_dir):
    """Return the generator fragment text for the LHE step, or a falsy value.

    The fragment URL (if any) is tried first; when there is no URL, the
    download fails, the server answers with an error status, or the body is
    empty, the "fragment" entry of the McM dict is used instead.
    """
    fragment = None
    fragment_url = get_genfragment_url(dataset, mcm_dir)
    if fragment_url:
        try:
            # timeout: a stalled connection would otherwise block this worker
            # thread forever and permanently occupy one slot of the pool.
            response = requests.get(fragment_url[0], verify=False, timeout=60)
        except requests.RequestException:
            response = None
        # Only accept a successful response; an HTTP error page is not a
        # fragment and must not suppress the McM fallback below.
        if response is not None and response.ok:
            fragment = response.text
    if not fragment:
        fragment = get_from_deep_json(
            get_mcm_dict(dataset, mcm_dir), "fragment")
    return fragment


def create_lhe_generator(dataset, recid, mcm_dir, gen_store='./lhe_generators/2016-sim'):
    """Extract the generator cards of a dataset's gridpack into *gen_store*.

    Parameters:
        dataset:   dataset name, used for lookups and in log messages.
        recid:     record id; cards end up in ``<gen_store>/gridpacks/<recid>``.
        mcm_dir:   directory of the LHE step (as returned by ``get_lhe``).
        gen_store: root directory for the extracted cards.

    The gridpack path is the first string argument of ``cms.vstring(...)`` in
    the generator fragment.  Which files are extracted depends on whether the
    path (or, for amcatnlo, the dataset name) mentions amcatnlo, madgraph or
    powheg.  The function returns None in every case; problems are reported
    on stderr and the dataset is skipped.  Nothing is done when the output
    directory already contains files.
    """
    import shlex  # local import: keeps this block self-contained

    fragment = _load_fragment(dataset, mcm_dir)
    if not fragment:
        print("<emp>\n[Error] in " + dataset +
              "\n==>\t No fragment URL and Empty fragment in mcm dict, Skipping\n</emp>", file=sys.stderr)
        return

    match = re.search(r"cms\.vstring\('(.*?)'", fragment)
    if not match:
        print("<vstring>\n[Warning] in " + dataset +
              "\n==>\t 'cms.vstring' not found in fragment , Skipping\n</vstring>", file=sys.stderr)
        return
    path = match.group(1)

    outfilepath = "{gen_store}/gridpacks/{recid}".format(
        gen_store=gen_store, recid=recid)

    if os.path.isdir(outfilepath) and os.listdir(outfilepath):
        print(str(recid) + ' recid gridpack Exist, Skipping')
        return

    # Branch order matters: amcatnlo is also recognised from the dataset name
    # and takes precedence over the other generators.
    if 'amcatnlo' in path or 'amcatnlo' in dataset:
        print(dataset + '\n' + str(recid) +
              " amcatnlo gridpack!!! path:" + path)
        files = [
            'process/Cards/run_card.dat',
            'process/Cards/proc_card*.dat',
            'process/Cards/param_card.dat',
        ]
        mv_cmd = "mv process/Cards/*dat .; rmdir -p process/Cards"
    elif 'madgraph' in path:
        files = [
            'process/madevent/Cards/run_card.dat',
            'process/madevent/Cards/proc_card*.dat',
            'process/madevent/Cards/param_card.dat',
        ]
        mv_cmd = "mv process/madevent/Cards/*dat .; rmdir -p process/madevent/Cards"
    elif 'powheg' in path:
        files = [
            '*.input',
        ]
        mv_cmd = ""
    else:
        print("<path>\n[Error] Unknown path:('" + path +
              "')\nDataset: " + dataset + '\n</path>', file=sys.stderr)
        return

    # `path` comes from downloaded text and ends up in a shell command line,
    # so it is quoted; ordinary paths pass through shlex.quote unchanged.
    # Member patterns are quoted so the shell does not expand `*` itself.
    # NOTE(review): GNU tar only treats `*` in member names as a wildcard
    # with --wildcards; confirm the tar in use matches these patterns.
    tar_cmd = "tar -xf {path} {files}".format(
        path=shlex.quote(path),
        files=" ".join(shlex.quote(name) for name in files))
    # tar and the follow-up move stay `;`-separated so files that were
    # extracted still get moved if tar complained about a missing member.
    steps = "; ".join(step for step in (tar_cmd, mv_cmd) if step)
    # `&&` ensures nothing is extracted or moved in the wrong directory when
    # mkdir/cd fail.  No `-C` is passed to tar: the shell is already inside
    # the output directory, and a relative `-C` path would be resolved
    # against that new working directory.
    cmds = [
        "mkdir -p {out} && cd {out} && {{ {steps}; }}".format(
            out=shlex.quote(outfilepath), steps=steps)
    ]
    cmd_run(cmds, dataset)


# Script body: for every NANOAODSIM dataset that has an LHE parent step,
# extract its gridpack cards in a background thread.
das_dir = "./inputs/das-json-store"
mcm_dir = "./inputs/mcm-store"
with open("./inputs/CMS-2016-mc-datasets.txt", 'r') as file:
    dataset_full_names = file.readlines()

# strip() instead of name[:-1]: slicing removes a real character when the last
# line has no trailing newline, which silently dropped that dataset.
dataset_nanoaod = [name.strip() for name in dataset_full_names
                   if name.strip().endswith('NANOAODSIM')]
l = len(dataset_nanoaod)
# i is the position of the dataset in the list, so the "(i/l)" progress
# output reaches l even when datasets without an LHE parent are skipped.
for i, dataset in enumerate(dataset_nanoaod, start=1):

    lhe_dir = get_lhe(dataset, mcm_dir)
    if not lhe_dir:
        continue  # no LHE parent for this record

    recid = RECID_INFO[dataset]

    print("Getting ({i}/{l}): {ds}".format(i=i, l=l, ds=lhe_dir))

    t = threading.Thread(target=create_lhe_generator,
                         args=(dataset, recid, lhe_dir))
    t.start()
    # Throttle: wait while 20 or more threads are alive.  active_count()
    # (the non-deprecated spelling of activeCount()) includes the main thread.
    while threading.active_count() >= 20:
        sleep(0.5)

0 comments on commit f6a86a7

Please sign in to comment.