Skip to content

Commit

Permalink
Merge pull request #204 from wguanicedew/master
Browse files Browse the repository at this point in the history
Add an activated-worker scaling factor and an nCoreFactor option for submitters
  • Loading branch information
mightqxc authored Nov 22, 2023
2 parents ec03a96 + e943301 commit fe516ec
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 11 deletions.
61 changes: 61 additions & 0 deletions .github/workflows/docker-publish-al.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Workflow: build the AlmaLinux 9 variant of the container image and
# publish it to the GitHub Container Registry.
#
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
name: Docker-alma9

# Run on published releases, or on demand from the Actions tab.
on:
  release:
    types: [published]

  workflow_dispatch:

env:
  # Use docker.io for Docker Hub if empty
  REGISTRY: ghcr.io
  # github.repository as <account>/<repo>
  IMAGE_NAME: ${{ github.repository }}-alma9


jobs:
  build:

    runs-on: ubuntu-latest
    # Read-only repo access plus write access to publish the package.
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      # Login against a Docker registry except on PR
      # https://github.com/docker/login-action
      - name: Log into registry ${{ env.REGISTRY }}
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Extract metadata (tags, labels) for Docker
      # https://github.com/docker/metadata-action
      - name: Extract Docker metadata
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push Docker image with Buildx (don't push on PR)
      # https://github.com/docker/build-push-action
      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.al9
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
14 changes: 7 additions & 7 deletions Dockerfile.al9
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,17 @@

ARG PYTHON_VERSION=3.11.4

FROM docker.io/almalinux:9
FROM docker.io/almalinux:9.2

ARG PYTHON_VERSION

RUN yum update -y
RUN yum install -y epel-release
RUN yum install -y --allowerasing gcc make less git psmisc curl voms-clients-cpp wget httpd logrotate procps mod_ssl \
openssl-devel readline-devel bzip2-devel libffi-devel zlib-devel passwd
RUN yum install -y yum-utils
RUN yum-config-manager --enable crb

# install mysql-community for CC7+Python3.11
RUN wget https://dev.mysql.com/get/mysql80-community-release-el9-4.noarch.rpm && \
rpm -Uvh mysql80-community-release-*.noarch.rpm && \
yum install -y mysql-community-devel mysql-community-client
RUN yum install -y --allowerasing gcc make less git psmisc curl voms-clients-cpp wget httpd logrotate procps mod_ssl \
openssl-devel readline-devel bzip2-devel libffi-devel zlib-devel passwd voms-clients-java which mysql-devel mariadb

# install python
RUN mkdir /tmp/python && cd /tmp/python && \
Expand Down Expand Up @@ -89,6 +87,8 @@ RUN chmod -R 777 /etc/grid-security/certificates
RUN chmod -R 777 /data/harvester
RUN chmod -R 777 /data/condor
RUN chmod -R 777 /etc/httpd
RUN chmod -R 777 /etc/vomses
RUN chmod -R 777 /etc/grid-security/vomsdir/lsst
RUN chmod -R 777 /var/log/httpd
RUN chmod -R 777 /var/lib/logrotate
RUN mkdir -p /opt/harvester/etc/queue_config && chmod 777 /opt/harvester/etc/queue_config
Expand Down
8 changes: 6 additions & 2 deletions pandaharvester/harvesterbody/worker_adjuster.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ def __init__(self, queue_config_mapper):
self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
except AttributeError:
self.maxNewWorkers = None
try:
self.activate_worker_factor = float(harvester_config.submitter.activateWorkerFactor)
except AttributeError:
self.activate_worker_factor = 1.0

# define number of workers to submit based on various information
def define_num_workers(self, static_num_workers, site_name):
Expand Down Expand Up @@ -145,11 +149,11 @@ def define_num_workers(self, static_num_workers, site_name):
else:
# limit the queue to the number of activated jobs to avoid empty pilots
try:
n_activated = max(job_stats[queue_name]["activated"], 1) # avoid no activity queues
n_activated = max(job_stats[queue_name]["activated"] * self.activate_worker_factor, 1) # avoid no activity queues
except KeyError:
# zero job in the queue
tmp_log.debug("no job in queue")
n_activated = 1
n_activated = max(1 - n_queue - n_ready - n_running, 0)
finally:
queue_limit = max_queued_workers
max_queued_workers = min(n_activated, max_queued_workers)
Expand Down
15 changes: 14 additions & 1 deletion pandaharvester/harvestersubmitter/htcondor_submitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def make_a_jdl(
prod_rc_permille=0,
token_dir=None,
is_gpu_resource=False,
n_core_factor=1,
**kwarg,
):
# make logger
Expand Down Expand Up @@ -206,8 +207,9 @@ def make_a_jdl(
n_core_total = int(_match.group(1))
tmpLog.debug(f"job attributes override by CRIC special_par: {attr}={str(_match.group(1))}")
# derived job attributes
n_core_total = n_core_total * n_core_factor
n_node = ceil(n_core_total / n_core_per_node)
request_ram_bytes = request_ram * 2**20
request_ram_bytes = request_ram * 2**20 * n_core_factor
request_ram_per_core = ceil(request_ram * n_node / n_core_total)
request_ram_bytes_per_core = ceil(request_ram_bytes * n_node / n_core_total)
request_cputime = request_walltime * n_core_total
Expand Down Expand Up @@ -243,6 +245,7 @@ def make_a_jdl(
"nCorePerNode": n_core_per_node,
"nCoreTotal": n_core_total,
"nNode": n_node,
"nCoreFactor": n_core_factor,
"requestRam": request_ram,
"requestRamBytes": request_ram_bytes,
"requestRamPerCore": request_ram_per_core,
Expand Down Expand Up @@ -338,6 +341,15 @@ def __init__(self, **kwarg):
else:
if (not self.nProcesses) or (self.nProcesses < 1):
self.nProcesses = 1
# ncore factor
try:
self.nCoreFactor = int(self.nCoreFactor)
except AttributeError:
self.nCoreFactor = 1
else:
self.nCoreFactor = int(self.nCoreFactor)
if (not self.nCoreFactor) or (self.nCoreFactor < 1):
self.nCoreFactor = 1
# executable file
try:
self.executableFile
Expand Down Expand Up @@ -824,6 +836,7 @@ def _choose_credential(workspec):
"log_dir": self.logDir,
"log_subdir": log_subdir,
"n_core_per_node": n_core_per_node,
"n_core_factor": self.nCoreFactor,
"panda_queue_name": panda_queue_name,
"x509_user_proxy": proxy,
"ce_info_dict": ce_info_dict,
Expand Down
13 changes: 12 additions & 1 deletion pandaharvester/harvestersubmitter/slurm_submitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ def __init__(self, **kwarg):
PluginBase.__init__(self, **kwarg)
if not hasattr(self, "localQueueName"):
self.localQueueName = "grid"
# ncore factor
try:
self.nCoreFactor = int(self.nCoreFactor)
except AttributeError:
self.nCoreFactor = 1
else:
self.nCoreFactor = int(self.nCoreFactor)
if (not self.nCoreFactor) or (self.nCoreFactor < 1):
self.nCoreFactor = 1

# submit workers
def submit_workers(self, workspec_list):
Expand Down Expand Up @@ -91,12 +100,13 @@ def make_placeholder_map(self, workspec):
n_core_per_node = n_core_per_node_from_queue

n_core_total = workspec.nCore if workspec.nCore else n_core_per_node
n_core_total = n_core_total * self.nCoreFactor
request_ram = max(workspec.minRamCount, 1 * n_core_total) if workspec.minRamCount else 1 * n_core_total
request_disk = workspec.maxDiskCount * 1024 if workspec.maxDiskCount else 1
request_walltime = workspec.maxWalltime if workspec.maxWalltime else 0

n_node = ceil(n_core_total / n_core_per_node)
request_ram_bytes = request_ram * 2**20
request_ram_bytes = request_ram * 2**20 * self.nCoreFactor
request_ram_per_core = ceil(request_ram * n_node / n_core_total)
request_ram_bytes_per_core = ceil(request_ram_bytes * n_node / n_core_total)
request_cputime = request_walltime * n_core_total
Expand All @@ -106,6 +116,7 @@ def make_placeholder_map(self, workspec):
placeholder_map = {
"nCorePerNode": n_core_per_node,
"nCoreTotal": n_core_total,
"nCoreFactor": self.nCoreFactor,
"nNode": n_node,
"requestRam": request_ram,
"requestRamBytes": request_ram_bytes,
Expand Down

0 comments on commit fe516ec

Please sign in to comment.