From ff0fa88e01b0741959617a909c2ec356fbf33012 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 23 Sep 2024 07:00:43 -0700 Subject: [PATCH 01/20] dev harvester to use local dev pilot if exists --- .../harvester/sandbox/lsst.rubin-srun.sh | 11 +++++++++-- .../harvester/sandbox/rubin-voms.config.tar | Bin 20480 -> 20480 bytes 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/helm/harvester/charts/harvester/sandbox/lsst.rubin-srun.sh b/helm/harvester/charts/harvester/sandbox/lsst.rubin-srun.sh index 51602a36..91b4c77b 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.rubin-srun.sh +++ b/helm/harvester/charts/harvester/sandbox/lsst.rubin-srun.sh @@ -84,8 +84,15 @@ fi # env echo -# cmd="$cmd --export=ALL /cvmfs/sw.lsst.eu/linux-x86_64/panda_env/v1.0.9/pilot/wrapper/rubin-wrapper.sh $@" -cmd="$cmd --export=ALL ${latest}/pilot/wrapper/rubin-wrapper.sh $@" + +# check if there is a local dev pilot +pilot_wrapper_local=/sdf/data/rubin/panda_jobs/panda_env_pilot/pilot_wrapper/rubin-wrapper.sh +if [[ -f ${pilot_wrapper_local} ]]; then + cmd="$cmd --export=ALL ${pilot_wrapper_local} $@" +else + # cmd="$cmd --export=ALL /cvmfs/sw.lsst.eu/linux-x86_64/panda_env/v1.0.9/pilot/wrapper/rubin-wrapper.sh $@" + cmd="$cmd --export=ALL ${latest}/pilot/wrapper/rubin-wrapper.sh $@" +fi echo $cmd ntasks=${ntasks_total} diff --git a/helm/harvester/charts/harvester/sandbox/rubin-voms.config.tar b/helm/harvester/charts/harvester/sandbox/rubin-voms.config.tar index 561fc8d086463cc3c44ce5a59fce4bca69061df1..72053ffb037f81be1807f7e91464d5fafd177a39 100644 GIT binary patch delta 389 zcmaiwu?oU46h)JeYN$~ YFZ75y+6xJ7B4)9l_0QF$?jLlLFX&!dS^xk5 delta 499 zcmaixJx;?w5JtT#J6M*5NFq99qNH&uX7*QfAkout0F(|eNJc4ANPiCAOL7)3Db%gMBdJoBHFs*8?18?@@%}0M7R^Qf-srV5 k!8N#>&2=D0FT?z`u0N-f>GJ)pM!KJ0(qs9&U(8130R-Q4!vFvP From 29385a87b8b793608b366ab8872236061cff8b02 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 23 Sep 2024 07:01:21 -0700 Subject: [PATCH 02/20] fix to link local harvester logs --- helm/harvester/values/values-lsst-prod.yaml | 2 +- helm/harvester/values/values-lsst.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/helm/harvester/values/values-lsst-prod.yaml b/helm/harvester/values/values-lsst-prod.yaml index b965de72..775d7161 100644 --- a/helm/harvester/values/values-lsst-prod.yaml +++ b/helm/harvester/values/values-lsst-prod.yaml @@ -22,7 +22,7 @@ harvester: mount: true class: sdf-data-rubin path: "/mnt/harvester-data" - logpath: "/mnt/harvester-data/panda_jobs/harvester_workdir_k8s/harvester_wdirs/SLAC_Harvester_SDF" + logpath: "/mnt/harvester-data/panda_jobs/harvester_workdir_k8s/harvester_wdirs/SLAC_Harvester_S3DF" size: 5Gi experiment: "lsst_prod" diff --git a/helm/harvester/values/values-lsst.yaml b/helm/harvester/values/values-lsst.yaml index a8f22075..90997783 100644 --- a/helm/harvester/values/values-lsst.yaml +++ b/helm/harvester/values/values-lsst.yaml @@ -21,7 +21,7 @@ harvester: mount: true class: sdf-data-rubin path: "/mnt/harvester-data" - logpath: "/mnt/harvester-data/panda_jobs/harvester_workdir_k8s/harvester_wdirs/SLAC_Harvester_SDF" + logpath: "/mnt/harvester-data/panda_jobs/harvester_workdir_k8s/harvester_wdirs/SLAC_Harvester_S3DF" size: 5Gi experiment: "lsst" From 0176d7a63693b43621bbbf933ca8e24707e3e632 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 23 Sep 2024 07:01:41 -0700 Subject: [PATCH 03/20] dev harvester to use loki realtime logging --- helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf | 2 +- .../charts/harvester/sandbox/lsst.submit_pilot_pull.sdf | 2 +- .../charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf | 2 +- .../charts/harvester/sandbox/lsst.submit_pilot_push.sdf | 2 +- .../charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf | 2 +- .../charts/harvester/sandbox/lsst.submit_pilot_slac.sdf | 2 +- helm/harvester/charts/mariadb/templates/statefulset.yaml | 1 + 7 files changed, 7 insertions(+), 6 deletions(-) diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf index f880a346..3a941f62 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf @@ -7,7 +7,7 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server loki;https://sdfloki.slac.stanford.edu:80 --realtime-logname Panda-RubinLog" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf index af516c21..7f1fa48e 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf @@ -10,7 +10,7 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" -arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki;https://sdfloki.slac.stanford.edu:80 --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf index 0d0aab45..287dcee9 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf @@ -1,6 +1,6 @@ executable = /opt/harvester/sandbox/lsst.rubin-srun.sh -arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki;https://sdfloki.slac.stanford.edu:80 --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf index b4db94af..e8c4e556 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf @@ -9,7 +9,7 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki;https://sdfloki.slac.stanford.edu:80 --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf index abeb6a99..18ffa72c 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf @@ -9,7 +9,7 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki;https://sdfloki.slac.stanford.edu:80 --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_slac.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_slac.sdf index 068b67f7..ee66630a 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_slac.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_slac.sdf @@ -10,7 +10,7 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper_slac.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PULL {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-dev.tar.gz" -arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server loki --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/mariadb/templates/statefulset.yaml b/helm/harvester/charts/mariadb/templates/statefulset.yaml index 96474656..ab30fd05 100644 --- a/helm/harvester/charts/mariadb/templates/statefulset.yaml +++ b/helm/harvester/charts/mariadb/templates/statefulset.yaml @@ -47,6 +47,7 @@ spec: {{- if not .Values.autoscaling.enabled }} replicas: {{ .Values.replicaCount }} {{- end }} + serviceName: {{ include "mariadb.fullname" . }} selector: matchLabels: {{- include "mariadb.selectorLabels" . | nindent 6 }} From 5b646753f8fe0a228e588efb630944d4f7171744 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Sun, 29 Sep 2024 09:39:04 -0700 Subject: [PATCH 04/20] disable loki realtime logging --- helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf | 4 ++-- .../charts/harvester/sandbox/lsst.submit_pilot_pull.sdf | 4 ++-- .../charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf | 4 +++- .../charts/harvester/sandbox/lsst.submit_pilot_push.sdf | 4 ++-- .../charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf | 4 ++-- helm/panda/charts/server/panda_server_config.json | 1 + 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf index 3a941f62..e0ffa919 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf @@ -5,9 +5,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -w generic --pilot-user rubin --url https://pandaserver-doma.cern.ch -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy" -# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server loki;https://sdfloki.slac.stanford.edu:80 --realtime-logname Panda-RubinLog" +# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf index 7f1fa48e..b053a431 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf @@ -8,9 +8,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz" -# arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" -arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki;https://sdfloki.slac.stanford.edu:80 --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +# arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf index 287dcee9..ff783f4b 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull_srun.sdf @@ -1,6 +1,8 @@ executable = /opt/harvester/sandbox/lsst.rubin-srun.sh -arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki;https://sdfloki.slac.stanford.edu:80 --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" + +# arguments = "--ntasks-total {nCoreTotal} --ntasks 1 --cpus-per-task 1 --mem-per-cpu {requestRamPerCore} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf index e8c4e556..3a070c75 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf @@ -7,9 +7,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz" -# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki;https://sdfloki.slac.stanford.edu:80 --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf index 18ffa72c..cdbcf1d4 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf @@ -7,9 +7,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz" -# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server loki;https://sdfloki.slac.stanford.edu:80 --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +# arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} diff --git a/helm/panda/charts/server/panda_server_config.json b/helm/panda/charts/server/panda_server_config.json index ce0502be..6b6e67ec 100644 --- a/helm/panda/charts/server/panda_server_config.json +++ b/helm/panda/charts/server/panda_server_config.json @@ -5,6 +5,7 @@ "CRIC_URL_DDMENDPOINTS": "$PANDA_CRIC_DDMENDPOINTS", "CRIC_URL_SCHEDCONFIG": "$PANDA_CRIC_SCHEDCONFIG", "CRIC_URL_SITES": "$PANDA_CRIC_SITES", + "RUCIO_RSE_USAGE": "/opt/panda/sandbox/rucio_rse_usage.json", "adder_plugins": "wlcg:dataservice.AdderDummyPlugin:AdderDummyPlugin", "backend": "postgres", "schemaPANDA": "$PANDA_DB_SCHEMAPANDA", From 69c484465e392fb93df2cb832fcb988a104a48ac Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Mon, 14 Oct 2024 07:38:50 -0700 Subject: [PATCH 05/20] merge and fix add_dumy_plugin --- helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf | 2 +- .../charts/harvester/sandbox/lsst.submit_pilot_pull.sdf | 2 +- .../charts/harvester/sandbox/lsst.submit_pilot_push.sdf | 2 +- .../charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf | 2 +- helm/panda/charts/server/panda_server_config.json | 2 +- helm/panda/values/values-lsst.yaml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf index e0ffa919..ffaf7644 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot.sdf @@ -5,7 +5,7 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -w generic --pilot-user rubin --url https://pandaserver-doma.cern.ch -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog" # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog" diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf index b053a431..2abc1f53 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf @@ -8,7 +8,7 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz" -arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" # arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf index 3a070c75..7f18d2e7 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push.sdf @@ -7,7 +7,7 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf index cdbcf1d4..bb3657b3 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_push_96.sdf @@ -7,7 +7,7 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz" -arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PUSH --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" diff --git a/helm/panda/charts/server/panda_server_config.json b/helm/panda/charts/server/panda_server_config.json index 6b6e67ec..ba6e8ed4 100644 --- a/helm/panda/charts/server/panda_server_config.json +++ b/helm/panda/charts/server/panda_server_config.json @@ -23,7 +23,7 @@ "dbpasswd": "$PANDA_DB_PASSWORD", "dbuser": "$PANDA_DB_USER", "dbname": "$PANDA_DB_NAME", - "adder_plugins": "wlcg:dataservice.adder_simple_plugin:AdderDummyPlugin,wlcg:dataservice.adder_simple_plugin:AdderSimplePlugin:sphenix", + "adder_plugins": "wlcg:dataservice.adder_dummy_plugin:AdderDummyPlugin,wlcg:dataservice.adder_simple_plugin:AdderSimplePlugin:sphenix", "setupper_plugins": "wlcg:dataservice.setupper_dummy_plugin:SetupperDummyPlugin", "token_authType": "oidc", "sandboxHostname": "$PANDA_HOSTNAME", diff --git a/helm/panda/values/values-lsst.yaml b/helm/panda/values/values-lsst.yaml index f1c975f6..f2e0da20 100644 --- a/helm/panda/values/values-lsst.yaml +++ b/helm/panda/values/values-lsst.yaml @@ -19,7 +19,7 @@ jedi: memory: 32Gi server: - replicaCount: 2 + replicaCount: 1 persistentvolume: class: wekafs--sdf-k8s01 create: false From af9544e5b5d541269756b346a3ccc338b03fee41 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 01:52:05 -0800 Subject: [PATCH 06/20] fix dbaccess --- helm/bigmon/charts/main/sandbox/local.py.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/bigmon/charts/main/sandbox/local.py.template b/helm/bigmon/charts/main/sandbox/local.py.template index 786a701d..058669f4 100755 --- a/helm/bigmon/charts/main/sandbox/local.py.template +++ b/helm/bigmon/charts/main/sandbox/local.py.template @@ -42,7 +42,7 @@ dbaccess_postgres = { } # Oracle or Postgres -dbaccess = ${BIGMON_DB_ACCESS} +dbaccess = '${BIGMON_DB_ACCESS}' #object store OBJECT_STORE = { From 1f39a856e3e0b80fc0045d97c7404dcb8b70a423 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 01:53:27 -0800 Subject: [PATCH 07/20] add idds live probe --- helm/idds/charts/rest/templates/statefulset.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/helm/idds/charts/rest/templates/statefulset.yaml b/helm/idds/charts/rest/templates/statefulset.yaml index 1160f42a..664bbeb6 100644 --- a/helm/idds/charts/rest/templates/statefulset.yaml +++ b/helm/idds/charts/rest/templates/statefulset.yaml @@ -85,6 +85,13 @@ spec: runuser -u atlpan -g zp -- /opt/idds/bin/start-daemon.sh all {{- end}} {{ end -}} + livenessProbe: + exec: + command: + - cat + - /var/log/idds/idds_health + initialDelaySeconds: 600 + periodSeconds: 600 ports: - name: https containerPort: 8443 From 487022259316818d001fcc236ea5cdff217653ae Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 01:53:55 -0800 Subject: [PATCH 08/20] upgrade idds --- helm/idds/charts/rest/idds_configmap.json | 3 ++- helm/idds/values.yaml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/helm/idds/charts/rest/idds_configmap.json b/helm/idds/charts/rest/idds_configmap.json index 354ff7a5..487c45e5 100644 --- a/helm/idds/charts/rest/idds_configmap.json +++ b/helm/idds/charts/rest/idds_configmap.json @@ -27,7 +27,8 @@ "username": "${IDDS_RECEIVER_USERNAME}", "password": "${IDDS_RECEIVER_PASSWORD}", "broker_timeout": 600}}, - "domapandawork.poll_panda_jobs_chunk_size": 2000 + "domapandawork.poll_panda_jobs_chunk_size": 2000, + "domapandawork.site_to_cloud": "SLAC:US,LANCS:EU,CC-IN2P3:EU,RAL:EU" }, "conductor": {"threshold_to_release_messages": 1000, diff --git a/helm/idds/values.yaml b/helm/idds/values.yaml index fb5c3302..8a384995 100644 --- a/helm/idds/values.yaml +++ b/helm/idds/values.yaml @@ -8,7 +8,7 @@ global: rest: enabled: true image: - tag: "2.1.30" + tag: "2.2.7" resources: limits: From 80d5af2c1401b9e44ef6f0a11426842b30d107fc Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 01:58:42 -0800 Subject: [PATCH 09/20] update panda logrotation cron job to run daily --- helm/panda/charts/jedi/sandbox/run-jedi-crons | 2 +- helm/panda/charts/server/sandbox/run-panda-crons | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/helm/panda/charts/jedi/sandbox/run-jedi-crons b/helm/panda/charts/jedi/sandbox/run-jedi-crons index 2dad5de1..3e7157d9 100644 --- a/helm/panda/charts/jedi/sandbox/run-jedi-crons +++ b/helm/panda/charts/jedi/sandbox/run-jedi-crons @@ -2,7 +2,7 @@ tmpExe=/data/panda/run-jedi-crons-tmp-exe cat <> ${tmpExe} while true; do sleep 36000; /opt/panda/bin/panda_common-install_igtf_ca > /var/log/panda/install_igtf_ca.log 2>&1; done & -while true; do /usr/sbin/logrotate /data/panda/logrotate-jedi >> /var/log/panda/logrotate.log 2>&1; sleep 3600; done & +while true; do /usr/sbin/logrotate /data/panda/logrotate-jedi >> /var/log/panda/logrotate.log 2>&1; sleep 86400; done & EOT chmod +x ${tmpExe} diff --git a/helm/panda/charts/server/sandbox/run-panda-crons b/helm/panda/charts/server/sandbox/run-panda-crons index 3f29e076..10086c55 100644 --- a/helm/panda/charts/server/sandbox/run-panda-crons +++ b/helm/panda/charts/server/sandbox/run-panda-crons @@ -3,7 +3,7 @@ tmpExe=/data/panda/run-panda-crons-tmp-exe cat <> ${tmpExe} while true; do sleep 36000; /opt/panda/bin/panda_common-install_igtf_ca > /var/log/panda/install_igtf_ca.log 2>&1; done & while true; do /opt/cacheschedconfig/bin/cacheSC.sh >> /var/log/panda/cacheSC.out 2>&1; sleep 60; done & -while true; do /usr/sbin/logrotate /data/panda/logrotate-panda >> /var/log/panda/logrotate.log 2>&1; sleep 3600; done & +while true; do /usr/sbin/logrotate /data/panda/logrotate-panda >> /var/log/panda/logrotate.log 2>&1; sleep 86400; done & EOT chmod +x ${tmpExe} From cc4e395ab135339eb2902f0898e63d12c26cc97f Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 01:59:58 -0800 Subject: [PATCH 10/20] upgrade panda and fix CRIC environment variables --- helm/panda/charts/server/panda_server_config.json | 2 +- helm/panda/values.yaml | 4 ++-- helm/panda/values/values-lsst.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/helm/panda/charts/server/panda_server_config.json b/helm/panda/charts/server/panda_server_config.json index ba6e8ed4..3ae03414 100644 --- a/helm/panda/charts/server/panda_server_config.json +++ b/helm/panda/charts/server/panda_server_config.json @@ -5,8 +5,8 @@ "CRIC_URL_DDMENDPOINTS": "$PANDA_CRIC_DDMENDPOINTS", "CRIC_URL_SCHEDCONFIG": "$PANDA_CRIC_SCHEDCONFIG", "CRIC_URL_SITES": "$PANDA_CRIC_SITES", + "CRIC_URL_TAGS": "$PANDA_CRIC_URLTAGS", "RUCIO_RSE_USAGE": "/opt/panda/sandbox/rucio_rse_usage.json", - "adder_plugins": "wlcg:dataservice.AdderDummyPlugin:AdderDummyPlugin", "backend": "postgres", "schemaPANDA": "$PANDA_DB_SCHEMAPANDA", "schemaMETA": "$PANDA_DB_SCHEMAMETA", diff --git a/helm/panda/values.yaml b/helm/panda/values.yaml index a9121693..ebc0349f 100644 --- a/helm/panda/values.yaml +++ b/helm/panda/values.yaml @@ -11,7 +11,7 @@ jedi: # container image and tag image: - tag: "0.4.3" + tag: "0.4.5" # tag: "master" # PV with selector support @@ -32,7 +32,7 @@ server: # container image and tag image: - tag: "0.3.20" + tag: "0.4.2" # tag: "master" # PV with selector support diff --git a/helm/panda/values/values-lsst.yaml b/helm/panda/values/values-lsst.yaml index f2e0da20..f1c975f6 100644 --- a/helm/panda/values/values-lsst.yaml +++ b/helm/panda/values/values-lsst.yaml @@ -19,7 +19,7 @@ jedi: memory: 32Gi server: - replicaCount: 1 + replicaCount: 2 persistentvolume: class: wekafs--sdf-k8s01 create: false From c07aad5381807b06c644dac030a690a17258735b Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 02:03:06 -0800 Subject: [PATCH 11/20] disable pilots when no activated jobs for RAL queues --- .../harvester/queueconfig/lsst.panda_queueconfig.json | 10 +++++----- .../queueconfig/lsst_prod.panda_queueconfig.json | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/helm/harvester/charts/harvester/queueconfig/lsst.panda_queueconfig.json b/helm/harvester/charts/harvester/queueconfig/lsst.panda_queueconfig.json index 5c003ba6..0644dbca 100644 --- a/helm/harvester/charts/harvester/queueconfig/lsst.panda_queueconfig.json +++ b/helm/harvester/charts/harvester/queueconfig/lsst.panda_queueconfig.json @@ -1014,7 +1014,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "ceQueueName": "EL9", "ceARCGridType": "arc", "ceHostname": ["arc-ce01.gridpp.rl.ac.uk", "arc-ce02.gridpp.rl.ac.uk"], @@ -1042,7 +1042,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1071,7 +1071,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1100,7 +1100,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1129,7 +1129,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", diff --git a/helm/harvester/charts/harvester/queueconfig/lsst_prod.panda_queueconfig.json b/helm/harvester/charts/harvester/queueconfig/lsst_prod.panda_queueconfig.json index e279f77e..62900167 100644 --- a/helm/harvester/charts/harvester/queueconfig/lsst_prod.panda_queueconfig.json +++ b/helm/harvester/charts/harvester/queueconfig/lsst_prod.panda_queueconfig.json @@ -1012,7 +1012,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "ceQueueName": "EL9", "ceARCGridType": "arc", "ceHostname": ["arc-ce01.gridpp.rl.ac.uk", "arc-ce02.gridpp.rl.ac.uk"], @@ -1040,7 +1040,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1069,7 +1069,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1098,7 +1098,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", @@ -1127,7 +1127,7 @@ "submitter": { "nCore": 1, "nCorePerNode": 1, - "noPilotsWhenNoActiveJobs": false, + "noPilotsWhenNoActiveJobs": true, "submitMode": "PULL", "ceQueueName": "EL9", "ceARCGridType": "arc", From 97462a766f331a56f7cc27401ec1d2bfb10b0cca Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 02:03:54 -0800 Subject: [PATCH 12/20] disable harvester k8s configuration --- .../harvester/sandbox/lsst.init-harvester | 38 +++++++++---------- .../sandbox/lsst_prod.init-harvester | 38 +++++++++---------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/helm/harvester/charts/harvester/sandbox/lsst.init-harvester b/helm/harvester/charts/harvester/sandbox/lsst.init-harvester index e303e276..491ed12a 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.init-harvester +++ b/helm/harvester/charts/harvester/sandbox/lsst.init-harvester @@ -22,25 +22,25 @@ yes|cp -fr etc/grid-security/vomsdir/lsst/* /etc/grid-security/vomsdir/lsst/ cd /data/harvester # gcloud config -cp /opt/harvester/etc/auth/gcloud_config.tar.gz /data/harvester -tar xzf gcloud_config.tar.gz -chmod 777 -R /data/harvester/gcloud_config +# cp /opt/harvester/etc/auth/gcloud_config.tar.gz /data/harvester +# tar xzf gcloud_config.tar.gz +# chmod 777 -R /data/harvester/gcloud_config # k8s config -cp /opt/harvester/etc/auth/k8s.tar.gz /data/harvester -tar xzf k8s.tar.gz -chmod 777 -R /data/harvester/k8s - -export CLOUDSDK_CONFIG=/data/harvester/gcloud_config -export KUBECONFIG=/data/harvester/gcloud_config/.kube - -mkdir -p /data/harvester/gcloud_config_rubin/ -for queue in moderatemem highmem extra-highmem merge highmem-non-preempt developmentcluster extra-highmem-non-preempt -do - export KUBECONFIG=/data/harvester/gcloud_config_rubin/$queue - gcloud container clusters get-credentials --region=us-central1 $queue - chmod og+rw $KUBECONFIG -done - -export KUBECONFIG=/data/harvester/gcloud_config/.kube +# cp /opt/harvester/etc/auth/k8s.tar.gz /data/harvester +# tar xzf k8s.tar.gz +# chmod 777 -R /data/harvester/k8s + +# export CLOUDSDK_CONFIG=/data/harvester/gcloud_config +# export KUBECONFIG=/data/harvester/gcloud_config/.kube + +# mkdir -p /data/harvester/gcloud_config_rubin/ +# for queue in moderatemem highmem extra-highmem merge highmem-non-preempt developmentcluster extra-highmem-non-preempt +# do +# export KUBECONFIG=/data/harvester/gcloud_config_rubin/$queue +# gcloud container clusters get-credentials --region=us-central1 $queue +# chmod og+rw $KUBECONFIG +# done + +# export KUBECONFIG=/data/harvester/gcloud_config/.kube diff --git a/helm/harvester/charts/harvester/sandbox/lsst_prod.init-harvester b/helm/harvester/charts/harvester/sandbox/lsst_prod.init-harvester index e303e276..491ed12a 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst_prod.init-harvester +++ b/helm/harvester/charts/harvester/sandbox/lsst_prod.init-harvester @@ -22,25 +22,25 @@ yes|cp -fr etc/grid-security/vomsdir/lsst/* /etc/grid-security/vomsdir/lsst/ cd /data/harvester # gcloud config -cp /opt/harvester/etc/auth/gcloud_config.tar.gz /data/harvester -tar xzf gcloud_config.tar.gz -chmod 777 -R /data/harvester/gcloud_config +# cp /opt/harvester/etc/auth/gcloud_config.tar.gz /data/harvester +# tar xzf gcloud_config.tar.gz +# chmod 777 -R /data/harvester/gcloud_config # k8s config -cp /opt/harvester/etc/auth/k8s.tar.gz /data/harvester -tar xzf k8s.tar.gz -chmod 777 -R /data/harvester/k8s - -export CLOUDSDK_CONFIG=/data/harvester/gcloud_config -export KUBECONFIG=/data/harvester/gcloud_config/.kube - -mkdir -p /data/harvester/gcloud_config_rubin/ -for queue in moderatemem highmem extra-highmem merge highmem-non-preempt developmentcluster extra-highmem-non-preempt -do - export KUBECONFIG=/data/harvester/gcloud_config_rubin/$queue - gcloud container clusters get-credentials --region=us-central1 $queue - chmod og+rw $KUBECONFIG -done - -export KUBECONFIG=/data/harvester/gcloud_config/.kube +# cp /opt/harvester/etc/auth/k8s.tar.gz /data/harvester +# tar xzf k8s.tar.gz +# chmod 777 -R /data/harvester/k8s + +# export CLOUDSDK_CONFIG=/data/harvester/gcloud_config +# export KUBECONFIG=/data/harvester/gcloud_config/.kube + +# mkdir -p /data/harvester/gcloud_config_rubin/ +# for queue in moderatemem highmem extra-highmem merge highmem-non-preempt developmentcluster extra-highmem-non-preempt +# do +# export KUBECONFIG=/data/harvester/gcloud_config_rubin/$queue +# gcloud container clusters get-credentials --region=us-central1 $queue +# chmod og+rw $KUBECONFIG +# done + +# export KUBECONFIG=/data/harvester/gcloud_config/.kube From ca934bffc3ec6a72bb3ce2246e2cda9bc2e44c23 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 02:04:47 -0800 Subject: [PATCH 13/20] fix harvester submit pilots pull --- .../charts/harvester/sandbox/lsst.submit_pilot_pull.sdf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf index 2abc1f53..331b06b5 100644 --- a/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf +++ b/helm/harvester/charts/harvester/sandbox/lsst.submit_pilot_pull.sdf @@ -8,9 +8,9 @@ executable = /opt/harvester/sandbox/lsst.rubin-wrapper.sh # arguments = "-s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t -w generic --pilot-user rubin --url https://panda-dev-server.slac.stanford.edu -d --harvester-submit-mode PUSH {pilotResourceTypeOption} --queuedata-url http://pandaserver-doma.cern.ch:25080/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --pilotversion 3 --pythonversion 3 --localpy -p 443 --job-type={jobType} --piloturl http://cern.ch/atlas-panda-pilot/pilot3-PRE.tar.gz" -arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +# arguments = "--pilotnum {nCoreTotal} -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://datalake-cric.cern.ch/api/atlas/ddmendpoint/query/?json --use-realtime-logging --realtime-logging-server google-cloud-logging --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" -# arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" +arguments = " -s {computingSite} -r {computingSite} -q {pandaQueueName} -j {prodSourceLabel} -i {pilotType} -t --es-executor-type fineGrainedProc -w generic --pilot-user rubin --allow-same-user false --url https://rubin-panda-server-dev.slac.stanford.edu:8443 -d --harvester-submit-mode PULL --queuedata-url https://rubin-panda-server-dev.slac.stanford.edu:8443/cache/schedconfig/{computingSite}.all.json --storagedata-url https://none-url --use-realtime-logging --realtime-logging-server logserver='loki;https://sdfloki.slac.stanford.edu:80' --realtime-logname Panda-RubinLog --pilotversion 3 --pythonversion 3 --localpy" initialdir = {accessPoint} From ffb1aa07025938909f6c0a78647f3ff4d42a1e41 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 02:07:02 -0800 Subject: [PATCH 14/20] add harvester live probe --- .../harvester/sandbox/health_monitor.py | 122 ++++++++++++++++++ .../harvester/sandbox/health_monitor.sh | 6 + .../harvester/templates/statefulset.yaml | 7 + 3 files changed, 135 insertions(+) create mode 100644 helm/harvester/charts/harvester/sandbox/health_monitor.py create mode 100644 helm/harvester/charts/harvester/sandbox/health_monitor.sh diff --git a/helm/harvester/charts/harvester/sandbox/health_monitor.py b/helm/harvester/charts/harvester/sandbox/health_monitor.py new file mode 100644 index 00000000..600be822 --- /dev/null +++ b/helm/harvester/charts/harvester/sandbox/health_monitor.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python + +""" +check harvester health +""" + +import os +import re +import subprocess + + +def check_command(command, check_string): + print("Checking command : {0}".format(command)) + print("For string : {0}".format(check_string)) + + tmp_array = command.split() + output = ( + subprocess.Popen(tmp_array, stdout=subprocess.PIPE) + .communicate()[0] + .decode("ascii") + ) + + if re.search(check_string, output): + print("Found the string, return 100") + return 100 + else: + print("String not found, return 0") + return 0 + + +def uwsgi_process_availability(): + # check the uwsgi + process_avail = 0 + output = ( + subprocess.Popen( + "ps -eo pgid,args | grep uwsgi | grep -v grep", + stdout=subprocess.PIPE, + shell=True, + ) + .communicate()[0] + .decode("ascii") + ) + count = 0 + for line in output.split("\n"): + line = line.strip() + if line == "": + continue + count += 1 + if count >= 1: + process_avail = 100 + + print("uwsgi process check availability: %s" % process_avail) + return process_avail + + +def condor_process_availability(): + # check the condor + process_avail = 0 + output = ( + subprocess.Popen( + "ps -eo pgid,args | grep condor_schedd | grep -v grep", + stdout=subprocess.PIPE, + shell=True, + ) + .communicate()[0] + .decode("ascii") + ) + count = 0 + for line in output.split("\n"): + line = line.strip() + if line == "": + continue + count += 1 + if count >= 1: + process_avail = 100 + + print("condor_q process check availability: %s" % process_avail) + return process_avail + + +def condor_q_availability(): + # check the condor_q + process_avail = 0 + try: + result = subprocess.run( + ["condor_q"], + timeout=10, # Timeout in seconds + capture_output=True, + text=True + ) + print(f"command output: {result.stdout}") + process_avail = 100 + except subprocess.TimeoutExpired: + print("The command timed out!") + process_avail = 0 + + print("condor_q process check availability: %s" % process_avail) + return process_avail + + +def main(): + uwsgi_avail, condor_avail, condor_q_avail = 0, 0, 0 + try: + uwsgi_avail = uwsgi_process_availability() + condor_avail = condor_process_availability() + condor_q_avail = condor_q_availability() + except Exception as ex: + print(f"failed to check availability: {ex}") + + print(f"uwsgi_avail: {uwsgi_avail}, condor_avail: {condor_avail}, condor_q_avail: {condor_q_avail}") + + health_monitor_file = "/var/log/panda/harvester_healthy" + if uwsgi_avail and condor_avail and condor_q_avail: + with open(health_monitor_file, 'w') as f: + f.write("OK") + else: + if os.path.exists(health_monitor_file): + os.remove(health_monitor_file) + + +if __name__ == '__main__': + main() diff --git a/helm/harvester/charts/harvester/sandbox/health_monitor.sh b/helm/harvester/charts/harvester/sandbox/health_monitor.sh new file mode 100644 index 00000000..d365f669 --- /dev/null +++ b/helm/harvester/charts/harvester/sandbox/health_monitor.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +source /opt/harvester/bin/activate +source /data/condor/condor/condor.sh + +python /data/harvester/health_monitor.py diff --git a/helm/harvester/charts/harvester/templates/statefulset.yaml b/helm/harvester/charts/harvester/templates/statefulset.yaml index dea19458..e787741f 100644 --- a/helm/harvester/charts/harvester/templates/statefulset.yaml +++ b/helm/harvester/charts/harvester/templates/statefulset.yaml @@ -180,6 +180,13 @@ spec: {{- end }} image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + livenessProbe: + exec: + command: + - cat + - /var/log/panda/harvester_healthy + initialDelaySeconds: 1800 + periodSeconds: 600 {{- if .Values.autoStart }} command: ["/bin/sh", "-c"] args: From 4f9c2104e9b77b2f7d05147bf45a20385b5b769d Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 02:07:55 -0800 Subject: [PATCH 15/20] add rucio_rse_usage.json file in panda sandbox --- helm/panda/sandbox/rucio_rse_usage.json | 1 + 1 file changed, 1 insertion(+) create mode 100644 helm/panda/sandbox/rucio_rse_usage.json diff --git a/helm/panda/sandbox/rucio_rse_usage.json b/helm/panda/sandbox/rucio_rse_usage.json new file mode 100644 index 00000000..935b8819 --- /dev/null +++ b/helm/panda/sandbox/rucio_rse_usage.json @@ -0,0 +1 @@ +{"null": "null"} From 4794aeaf3784ebf3c97dee36b211d93b88be94ed Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 02:08:38 -0800 Subject: [PATCH 16/20] add harvester monitor cron job --- helm/harvester/charts/harvester/sandbox/run-harvester-crons | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/helm/harvester/charts/harvester/sandbox/run-harvester-crons b/helm/harvester/charts/harvester/sandbox/run-harvester-crons index 09121868..89e38722 100644 --- a/helm/harvester/charts/harvester/sandbox/run-harvester-crons +++ b/helm/harvester/charts/harvester/sandbox/run-harvester-crons @@ -6,6 +6,10 @@ while true; do sleep 36000; /opt/harvester/bin/panda_common-install_igtf_ca > /v # log rotate while true; do /usr/sbin/logrotate /data/harvester/logrotate-harvester >> /var/log/panda/logrotate.log 2>&1; sleep 3600; done & +# health monitor + +while true; do bash /data/harvester/health_monitor.sh >> /var/log/panda/health_monitor.log 2>&1; sleep 600; done & + # experiment specific if [[ ! -z "${EXPERIMENT}" ]]; then CurrentDir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" From 430f47b9c21ff64d60ac9d3766431c93bf4cc988 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 02:09:07 -0800 Subject: [PATCH 17/20] upgrade harvester --- helm/harvester/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/harvester/values.yaml b/helm/harvester/values.yaml index 8e29a75c..c4b881f8 100644 --- a/helm/harvester/values.yaml +++ b/helm/harvester/values.yaml @@ -10,7 +10,7 @@ harvester: enabled: true # container image and tag image: - tag: "v0.5.6" + tag: "v0.5.14" # tag: "master" # PV with selector support From 3a41610841697bae79151215f01bf9aabec247e0 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 02:09:25 -0800 Subject: [PATCH 18/20] upgrade bigmon --- helm/bigmon/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/bigmon/values.yaml b/helm/bigmon/values.yaml index 4125b049..ba4f1661 100644 --- a/helm/bigmon/values.yaml +++ b/helm/bigmon/values.yaml @@ -9,7 +9,7 @@ main: enabled: true image: - tag: "v0.6.17" + tag: "v0.6.20" autoStart: true From 644118b7ad94f3d4311dd0c80a4dad1023b68a2b Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Thu, 21 Nov 2024 04:05:59 -0800 Subject: [PATCH 19/20] upgrade bigmon --- helm/bigmon/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/bigmon/values.yaml b/helm/bigmon/values.yaml index ba4f1661..d3511866 100644 --- a/helm/bigmon/values.yaml +++ b/helm/bigmon/values.yaml @@ -9,7 +9,7 @@ main: enabled: true image: - tag: "v0.6.20" + tag: "v0.6.22" autoStart: true From 65a14c26314b5ab6ca26df18077fe76ff2ff5698 Mon Sep 17 00:00:00 2001 From: Wen Guan Date: Fri, 22 Nov 2024 04:52:07 -0800 Subject: [PATCH 20/20] upgrade bigmon to 0.6.25 --- helm/bigmon/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/bigmon/values.yaml b/helm/bigmon/values.yaml index d3511866..8124f5c4 100644 --- a/helm/bigmon/values.yaml +++ b/helm/bigmon/values.yaml @@ -9,7 +9,7 @@ main: enabled: true image: - tag: "v0.6.22" + tag: "v0.6.25" autoStart: true