Skip to content

Commit

Permalink
cms-2016-simulated-datasets: add a mini-nano cache, get reprocessing …
Browse files Browse the repository at this point in the history
…year
  • Loading branch information
Kati Lassila-Perini committed Oct 26, 2023
1 parent f26ab08 commit 996f971
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 35 deletions.
62 changes: 27 additions & 35 deletions cms-2016-simulated-datasets/code/dataset_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
get_dataset_location)
from mcm_store import (get_cmsDriver_script, get_cmssw_version_from_mcm,
get_conffile_ids_from_mcm, get_dataset_energy,
get_data_processing_year,
get_generator_name, get_generator_parameters_from_mcm,
get_genfragment_url, get_global_tag, get_mcm_dict,
get_parent_dataset_from_mcm, get_pileup_from_mcm,
Expand All @@ -47,6 +48,8 @@

# Module-level cache filled once by populate_containerimages_cache().
CONTAINERIMAGES_CACHE = {}

# Module-level cache filled once by populate_mininanorelation_cache():
# maps a MINIAODSIM dataset name to the dataset name derived from its
# matching entry in the McM store's 'chain' directory.
MININANORELATION_CACHE = {}

def get_number_events(dataset, das_dir):
"""Return number of events for the dataset."""
number_events = get_from_deep_json(get_das_store_json(dataset, 'dataset', das_dir), 'nevents')
Expand Down Expand Up @@ -164,23 +167,12 @@ def get_globaltag_from_conffile(afile, conf_dir):
return globaltag


#def get_all_generator_text(dataset, das_dir, mcm_dir, conf_dir, recid):
def get_all_generator_text(dataset, das_dir, mcm_dir, conf_dir, recid_info):
"""Return DICT with all information about the generator steps."""

# For MiniAODSIM, find the corresponding Nano and use that information
# Might be best done when querying the McM

if dataset.endswith('MINIAODSIM'):
nano_found=0
dataset_first_name = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), 'dataset_name')
for x in os.listdir(mcm_dir + '/chain'):
if x.startswith('@'+dataset_first_name):
dataset = x.replace('@', '/')
nano_found=1

if nano_found==0:
print("A corresponding NANOAODSIM was not found for dataset: " + dataset)
if dataset.endswith('MINIAODSIM'):
dataset = MININANORELATION_CACHE[dataset]

recid = recid_info[dataset]
info = {}
Expand Down Expand Up @@ -259,18 +251,6 @@ def get_all_generator_text(dataset, das_dir, mcm_dir, conf_dir, recid_info):

info["steps"].append(step)

# post-generation fix: if we have LHE step, let's modify the configuration file titles for other steps
# FIXME: is this now a duplicate of the condition above?
lhe_present = False
for step in info['steps']:
if lhe_present:
for configuration_file in step.get('configuration_files'):
if configuration_file['title'] == 'Generator parameters':
print("in the lhe_present condition with title Generator parameters not yet changed")
configuration_file['title'] = 'Hadronizer parameters'
if 'LHE' in step['type']:
lhe_present = True

# post-generation fix: keep generators only for the first step, remove from others:
generators_present = False
for step in info['steps']:
Expand All @@ -290,6 +270,21 @@ def populate_containerimages_cache():
for key in content.keys():
CONTAINERIMAGES_CACHE[key] = content[key]

def populate_mininanorelation_cache(dataset_full_names, mcm_dir):
    """Populate MININANORELATION_CACHE with the NANO matching each MINIAODSIM.

    The cache is used to find the corresponding NANO for provenance, and for
    dataset -> relations.

    For every name in ``dataset_full_names`` ending in 'MINIAODSIM', the McM
    'dataset_name' of that dataset is looked up and the entries of
    ``<mcm_dir>/chain`` are scanned for one belonging to it. The matching
    entry, with '@' replaced by '/', is stored in the cache under the
    MINIAODSIM name. If several entries match, the last one listed wins.
    If none matches, a message is printed and no cache entry is created.

    Args:
        dataset_full_names: iterable of full dataset names.
        mcm_dir: path of the local McM store (must contain a 'chain' directory
            as soon as at least one MINIAODSIM dataset is given).
    """
    # The directory listing does not depend on the dataset: read it once, and
    # only if there is at least one MINIAODSIM dataset to resolve.
    chain_entries = None

    for dataset_full_name in dataset_full_names:
        if not dataset_full_name.endswith('MINIAODSIM'):
            continue

        dataset_first_name = get_from_deep_json(get_mcm_dict(dataset_full_name, mcm_dir), 'dataset_name')
        if chain_entries is None:
            chain_entries = os.listdir(mcm_dir + '/chain')

        # Require the '@' that closes the first name component, so that e.g.
        # 'Foo' does not also match the entries of 'Foo_ext' or 'FooBar'.
        # NOTE(review): assumes chain entries are named
        # '@<dataset_name>@<rest>' (a dataset path with '/' -> '@') -- confirm.
        entry_prefix = '@' + dataset_first_name + '@'

        nano_found = False
        for entry in chain_entries:
            if entry.startswith(entry_prefix):
                MININANORELATION_CACHE[dataset_full_name] = entry.replace('@', '/')
                nano_found = True

        if not nano_found:
            print("A corresponding NANOAODSIM was not found for dataset: " + dataset_full_name)


def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir):
"""Create record for the given dataset."""

Expand Down Expand Up @@ -317,8 +312,8 @@ def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm
rec['collections'] = ['CMS-Simulated-Datasets', ]

rec['collision_information'] = {}
rec['collision_information']['energy'] = get_dataset_energy(dataset_full_name, mcm_dir)
rec['collision_information']['type'] = 'pp' # FIXME do not hardcode
rec['collision_information']['energy'] = collision_energy
rec['collision_information']['type'] = collision_type

# FIXME cross section will be read in separately
generator_parameters = get_generator_parameters_from_mcm(dataset_full_name, mcm_dir)
Expand All @@ -332,8 +327,7 @@ def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm

rec['date_created'] = [year_created]
rec['date_published'] = year_published
rec['date_reprocessed'] = year_created # FIXME, this is not correct
# for year_reprocessed: could use the year from "pdmv_submission_date": "220201", or "pdmv_monitor_time": "Sun Feb 06 13:24:33 2022", reqmgr_name etc in dict
rec['date_reprocessed'] = get_data_processing_year(dataset_full_name, mcm_dir)
rec['distribution'] = {}
rec['distribution']['formats'] = [dataset_format.lower(), 'root']
rec['distribution']['number_events'] = get_number_events(dataset_full_name, das_dir)
Expand Down Expand Up @@ -369,10 +363,7 @@ def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm
# For Mini, get the pileup from the corresponding Nano
dataset_name_for_nano = dataset_full_name
if dataset_full_name.endswith('MINIAODSIM'):
dataset_first_name = get_from_deep_json(get_mcm_dict(dataset_full_name, mcm_dir), 'dataset_name')
for x in os.listdir(mcm_dir + '/chain'):
if x.startswith('@'+dataset_first_name):
dataset_name_for_nano = x.replace('@', '/')
dataset_name_for_nano = MININANORELATION_CACHE[dataset_full_name]

pileup_dataset_name= ''
pileup_dataset_name= get_pileup_from_mcm(dataset_name_for_nano, mcm_dir)
Expand Down Expand Up @@ -458,8 +449,8 @@ def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm
'2011': 2011,
'2012': 2011}.get(year_created, 2011)
rec['usage'] = {}
rec['usage']['description'] = "You can access these data through the CMS Open Data container or the CMS Virtual Machine. See the instructions for setting up one of the two alternative environments and getting started in" # FIXME
rec['usage']['links'] = [ # FIXME
rec['usage']['description'] = "You can access these data through the CMS Open Data container or the CMS Virtual Machine. See the instructions for setting up one of the two alternative environments and getting started in"
rec['usage']['links'] = [
{
"description": "Running CMS analysis code using Docker",
"url": "/docs/cms-guide-docker"
Expand Down Expand Up @@ -532,6 +523,7 @@ def main(datasets, eos_dir, das_dir, mcm_dir, conffiles_dir, doi_file, recid_fil
"Do the job."

populate_containerimages_cache()
populate_mininanorelation_cache(datasets, mcm_dir)

records_dir= "./outputs/records-" + dt.now().strftime("%Y-%m")
os.makedirs(records_dir, exist_ok=True)
Expand Down
11 changes: 11 additions & 0 deletions cms-2016-simulated-datasets/code/mcm_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,17 @@ def get_dataset_energy(dataset, mcm_dir):
2016: '13TeV',
}.get(year, 0)

def get_data_processing_year(dataset, mcm_dir):
    """Return the processing year of the dataset, as a 'YYYY' string, from McM.

    The year is taken from the first two characters of the McM
    'pdmv_submission_date' value, which is expected to be a six-digit
    'yymmdd' string, prefixed with '20'.

    If the value is missing, is not a string, or is not six digits, a message
    is printed and '0000' is returned.
    """
    mcm_dict = get_mcm_dict(dataset, mcm_dir)
    date = get_from_deep_json(mcm_dict, 'pdmv_submission_date')

    # Check the type first: the lookup may not yield a string (e.g. when the
    # key is absent), and len()/concatenation would then raise TypeError
    # instead of reaching the fallback below.
    if isinstance(date, str) and len(date) == 6 and date.isdigit():
        return '20' + date[:2]

    print("In get_data_processing_year, unexpected format for date: expecting yymmdd, got :" + str(date))
    return '0000'


def get_generator_name(dataset, mcm_dir):
"Return list of generators used for that dataset"
Expand Down

0 comments on commit 996f971

Please sign in to comment.