From d643ff30101f5be76e4525d8f0825a91e897e7fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tibor=20=C5=A0imko?= Date: Mon, 6 Nov 2023 14:19:23 +0100 Subject: [PATCH] jade-2023-raw-datasets: prepare logbook records Initial version of the JADE logbook record preparation script. To be improved by MPP in the coming weeks. --- .gitignore | 47 +++---- README.rst | 1 + jade-2023-raw-datasets/README.rst | 6 + .../code/create_logbook_records.py | 118 ++++++++++++++++++ .../inputs/eos-file-information-logbooks.txt | 21 ++++ jade-2023-raw-datasets/run.sh | 18 +++ run-tests.sh | 1 + 7 files changed, 189 insertions(+), 23 deletions(-) create mode 100644 jade-2023-raw-datasets/README.rst create mode 100644 jade-2023-raw-datasets/code/create_logbook_records.py create mode 100644 jade-2023-raw-datasets/inputs/eos-file-information-logbooks.txt create mode 100755 jade-2023-raw-datasets/run.sh diff --git a/.gitignore b/.gitignore index 53bdb803d..8f7643e66 100644 --- a/.gitignore +++ b/.gitignore @@ -8,36 +8,31 @@ venv/ *.err cms-2010-collision-datasets/outputs/*.json cms-2010-simulated-datasets/outputs/*.json +cms-2011-collision-datasets-runb-update/inputs/config-store +cms-2011-collision-datasets-runb-update/inputs/das-json-config-store +cms-2011-collision-datasets-runb-update/inputs/das-json-store +cms-2011-collision-datasets-runb-update/outputs/*.json cms-2011-collision-datasets/code/das.py cms-2011-collision-datasets/inputs/das-json-store cms-2011-collision-datasets/outputs/*.xml -cms-2011-collision-datasets-runb-update/inputs/das-json-store -cms-2011-collision-datasets-runb-update/inputs/das-json-config-store -cms-2011-collision-datasets-runb-update/inputs/config-store -cms-2011-collision-datasets-runb-update/outputs/*.json cms-2011-hlt-triggers/outputs/*.html cms-2011-hlt-triggers/outputs/*.xml cms-2011-l1-triggers/outputs/*.xml cms-2011-simulated-datasets/inputs/das-json-store cms-2011-simulated-datasets/outputs/*.xml -cms-2012-collision-datasets/inputs/das-json-store 
-cms-2012-collision-datasets/outputs/*.json -cms-2012-collision-datasets-update/inputs/das-json-store -cms-2012-collision-datasets-update/inputs/das-json-config-store cms-2012-collision-datasets-update/inputs/config-store +cms-2012-collision-datasets-update/inputs/das-json-config-store +cms-2012-collision-datasets-update/inputs/das-json-store cms-2012-collision-datasets-update/outputs/*.json +cms-2012-collision-datasets/inputs/das-json-store +cms-2012-collision-datasets/outputs/*.json cms-2012-event-display-files/inputs/ig/ cms-2012-event-display-files/outputs/*.json cms-2012-simulated-datasets/inputs/config-store cms-2012-simulated-datasets/inputs/das-json-store +cms-2012-simulated-datasets/outputs/*.json cms-2012-simulated-datasets/outputs/create-config-store.sh cms-2012-simulated-datasets/outputs/create-das-json-store.sh -cms-2012-simulated-datasets/outputs/*.json -cms-2013-hlt-triggers/outputs -cms-2013-simulated-datasets-hi/inputs/das-json-store -cms-2013-simulated-datasets-hi/inputs/mcm-store -cms-2013-simulated-datasets-hi/inputs/config-store -cms-2013-simulated-datasets-hi/outputs/ cms-2013-collision-datasets-hi-ppref/inputs/config-store cms-2013-collision-datasets-hi-ppref/inputs/das-json-config-store cms-2013-collision-datasets-hi-ppref/inputs/das-json-store @@ -46,25 +41,31 @@ cms-2013-collision-datasets-hi/inputs/config-store cms-2013-collision-datasets-hi/inputs/das-json-config-store cms-2013-collision-datasets-hi/inputs/das-json-store cms-2013-collision-datasets-hi/outputs/*.json -cms-2015-collision-datasets/inputs/das-json-store -cms-2015-collision-datasets/inputs/das-json-config-store -cms-2015-collision-datasets/outputs/*.json +cms-2013-hlt-triggers/outputs +cms-2013-simulated-datasets-hi/inputs/config-store +cms-2013-simulated-datasets-hi/inputs/das-json-store +cms-2013-simulated-datasets-hi/inputs/mcm-store +cms-2013-simulated-datasets-hi/outputs/ cms-2015-collision-datasets-hi-ppref/inputs/config-store 
-cms-2015-collision-datasets-hi-ppref/inputs/das-json-store cms-2015-collision-datasets-hi-ppref/inputs/das-json-config-store +cms-2015-collision-datasets-hi-ppref/inputs/das-json-store cms-2015-collision-datasets-hi-ppref/outputs/*.json +cms-2015-collision-datasets/inputs/das-json-config-store +cms-2015-collision-datasets/inputs/das-json-store +cms-2015-collision-datasets/outputs/*.json +cms-2015-simulated-datasets/inputs/config-store cms-2015-simulated-datasets/inputs/das-json-store cms-2015-simulated-datasets/inputs/mcm-store -cms-2015-simulated-datasets/inputs/config-store -cms-2015-simulated-datasets/outputs/ cms-2015-simulated-datasets/lhe_generators -cod2-to-cod3/outputs/*.json -opera-2017-multiplicity-studies/outputs/opera-events.json +cms-2015-simulated-datasets/outputs/ cms-YYYY-simulated-datasets/cache cms-YYYY-simulated-datasets/outputs/*.csv cms-YYYY-simulated-datasets/outputs/*.err cms-YYYY-simulated-datasets/outputs/*.json cod2-to-cod3/outputs/*.json +cod2-to-cod3/outputs/*.json +jade-2023-raw-datasets/outputs/*.json +opera-2017-multiplicity-studies/outputs/opera-events.json opera-2017-multiplicity-studies/outputs/opera-events.json -opera-2019-neutrino-induced-charm/outputs/opera-events.json opera-2019-electron-neutrinos/outputs/opera-events.json +opera-2019-neutrino-induced-charm/outputs/opera-events.json diff --git a/README.rst b/README.rst index c5c74342f..9f5135852 100644 --- a/README.rst +++ b/README.rst @@ -58,6 +58,7 @@ Specific data ingestion and curation campaigns: - `cms-run2-hlt-triggers `_ -- helper scripts for the CMS Run2 data release (HLT triggers) - `cms-run2-ultra-legacy-production `_ - helper scripts for CMS Run2 ultra-legacy production - `cod2-to-cod3 `_ - record migration from version 2 to version 3 +- `jade-2023-raw-datasets `_ - helper scripts for the initial release of JADE data - `opera-2017-multiplicity-studies `_ - helper scripts for the release of OPERA multiplicity studies - `opera-2019-electron-neutrinos `_ - helper
scripts for the release of OPERA electron neutrino events - `opera-2019-neutrino-induced-charm `_ - helper scripts for the release of OPERA charm events diff --git a/jade-2023-raw-datasets/README.rst b/jade-2023-raw-datasets/README.rst new file mode 100644 index 000000000..ef73f562a --- /dev/null +++ b/jade-2023-raw-datasets/README.rst @@ -0,0 +1,6 @@ +======================== + jade-2023-raw-datasets +======================== + +This directory contains helper scripts used to prepare the initial release of +JADE data in 2023. Includes raw datasets with accompanying logbooks and notes. diff --git a/jade-2023-raw-datasets/code/create_logbook_records.py b/jade-2023-raw-datasets/code/create_logbook_records.py new file mode 100644 index 000000000..6665a5eef --- /dev/null +++ b/jade-2023-raw-datasets/code/create_logbook_records.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +""" +Create JADE logbook records. +""" + +import json +import re + +recid_start = 26100 +year_published = "2023" + + +def create_record(recid, path, size, checksum): + """Create record for the given JADE logbook.""" + + rec = {} + + try: + logbook_number = re.match(r"^.*Log([0-9]+)\.pdf", path).groups(0)[0] + except Exception: + logbook_number = "FIXME" + + logbook_period = "1970" + + rec["abstract"] = {} + rec["abstract"][ + "description" + ] = f"This is JADE logbook {logbook_number} from {logbook_period}. FIXME add more description." 
+ + rec["accelerator"] = "DESY-PETRA" + + rec["collaboration"] = {} + rec["collaboration"]["name"] = "JADE collaboration" + rec["collaboration"]["recid"] = "451" + + rec["collections"] = [ + "JADE-Logbooks", + ] + + rec["date_created"] = [ + logbook_period, + ] + rec["date_published"] = year_published + + rec["distribution"] = {} + rec["distribution"]["formats"] = [ + "pdf", + ] + rec["distribution"]["number_files"] = 1 + rec["distribution"]["size"] = size + + rec["experiment"] = "JADE" + + rec["files"] = [] + rec["files"].append( + { + "checksum": "adler32:" + checksum, + "size": size, + "uri": path, + } + ) + + rec["license"] = {} + rec["license"]["attribution"] = "CC0" + + rec["publisher"] = "CERN Open Data Portal" + + rec["recid"] = str(recid) + + rec["title"] = f"JADE logbook number {logbook_number}" + + rec["type"] = {} + rec["type"]["primary"] = "Supplementaries" + rec["type"]["secondary"] = [ + "Logbook", + ] + + return rec + + +def create_records(): + """Create records.""" + with open("./inputs/eos-file-information-logbooks.txt", "r") as f: + records = [] + recid = recid_start + for line in f.readlines(): + match = re.match(r"^path=(.*) size=(.*) checksum=(.*)$", line.strip()) + if match: + path, size, checksum = match.groups() + size = int(size) + records.append(create_record(recid, path, size, checksum)) + recid += 1 + return records + + +def print_records(records): + """Print records.""" + print( + json.dumps( + records, + indent=2, + sort_keys=True, + ensure_ascii=False, + separators=(",", ": "), + ) + ) + + +def main(): + "Do the job." 
+ + records = create_records() + print_records(records) + + +if __name__ == "__main__": + main() diff --git a/jade-2023-raw-datasets/inputs/eos-file-information-logbooks.txt b/jade-2023-raw-datasets/inputs/eos-file-information-logbooks.txt new file mode 100644 index 000000000..1fac05043 --- /dev/null +++ b/jade-2023-raw-datasets/inputs/eos-file-information-logbooks.txt @@ -0,0 +1,21 @@ +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log01.pdf size=37913903 checksum=5537cb30 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log02.pdf size=38811120 checksum=8eb74619 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log03.pdf size=50652976 checksum=970d45ef +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log04.pdf size=50162033 checksum=dbcf6160 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log05.pdf size=47007176 checksum=8af84e12 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log06.pdf size=48306806 checksum=da5c1a3e +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log07.pdf size=50320784 checksum=ae36da39 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log08.pdf size=41087938 checksum=50e06402 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log09.pdf size=41056855 checksum=8c6b28a5 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log10.pdf size=40359158 checksum=a3936655 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log11.pdf size=41105169 checksum=20a2305c +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log12.pdf size=39146329 checksum=9e547ac6 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log13.pdf size=38893971 checksum=8004225d +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log14.pdf size=43056879 checksum=ed017f53 
+path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log15.pdf size=41640363 checksum=8f39d465 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log16.pdf size=39921243 checksum=9f2a1c0f +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log17.pdf size=43588114 checksum=9efd7aa2 +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log18.pdf size=40612426 checksum=3b083c0b +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log19.pdf size=42436620 checksum=996c06fe +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log20.pdf size=40799092 checksum=ea108b6a +path=root://eospublic.cern.ch//eos/opendata/jade/documentation/logbooks/Log21.pdf size=17001411 checksum=5d91f901 diff --git a/jade-2023-raw-datasets/run.sh b/jade-2023-raw-datasets/run.sh new file mode 100755 index 000000000..865b1923c --- /dev/null +++ b/jade-2023-raw-datasets/run.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +## 1) create EOS file indexes: + +# lxplus> eos find --xurl --size --checksum /eos/opendata/jade/upload/documentation/LogBooks | grep pdf > ./inputs/eos-file-information-logbooks.txt + +## 2) create JADE logbook records + +mkdir -p outputs +python ./code/create_logbook_records.py > ./outputs/jade-logbooks.json + +## 3) check the validity of resulting JSON files + +jsonlint -q ./outputs/*.json + +## 4) copy them to CERN Open Data fixtures directory + +\cp outputs/*.json ../../opendata.cern.ch/cernopendata/modules/fixtures/data/records diff --git a/run-tests.sh b/run-tests.sh index 04f24e187..d8af8b439 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -12,6 +12,7 @@ check_black() { cms-YYYY-run-numbers/code/*.py \ cms-2013-collision-datasets-hi-ppref/code/*.py \ cms-2015-collision-datasets-hi-ppref/code/*.py \ + jade-2023-raw-datasets/code/*.py \ utils/*.py }