Skip to content

Commit

Permalink
Merge pull request #204 from wguanicedew/master
Browse files Browse the repository at this point in the history
Add an activated-worker scaling factor and an nCoreFactor option for submitters
  • Loading branch information
mightqxc authored Nov 22, 2023
2 parents ec03a96 + e943301 commit fe516ec
Show file tree
Hide file tree
Showing 5 changed files with 100 additions and 11 deletions.
61 changes: 61 additions & 0 deletions .github/workflows/docker-publish-al.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Workflow: build the AlmaLinux 9 variant of the container image and
# publish it to the GitHub Container Registry.
#
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
name: Docker-alma9

# Run on published releases, or on demand from the Actions tab.
on:
  release:
    types: [published]

  workflow_dispatch:

env:
  # Use docker.io for Docker Hub if empty
  REGISTRY: ghcr.io
  # github.repository as <account>/<repo>
  IMAGE_NAME: ${{ github.repository }}-alma9


jobs:
  build:

    runs-on: ubuntu-latest
    # Read-only repo access plus write access to publish the package.
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      # Login against a Docker registry except on PR
      # https://github.com/docker/login-action
      - name: Log into registry ${{ env.REGISTRY }}
        if: github.event_name != 'pull_request'
        uses: docker/login-action@v2
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Extract metadata (tags, labels) for Docker
      # https://github.com/docker/metadata-action
      - name: Extract Docker metadata
        id: meta
        uses: docker/metadata-action@v4
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

      # Build and push Docker image with Buildx (don't push on PR)
      # https://github.com/docker/build-push-action
      - name: Build and push Docker image
        id: build-and-push
        uses: docker/build-push-action@v4
        with:
          context: .
          file: Dockerfile.al9
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
14 changes: 7 additions & 7 deletions Dockerfile.al9
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,17 @@

ARG PYTHON_VERSION=3.11.4

FROM docker.io/almalinux:9
FROM docker.io/almalinux:9.2

ARG PYTHON_VERSION

RUN yum update -y
RUN yum install -y epel-release
RUN yum install -y --allowerasing gcc make less git psmisc curl voms-clients-cpp wget httpd logrotate procps mod_ssl \
openssl-devel readline-devel bzip2-devel libffi-devel zlib-devel passwd
RUN yum install -y yum-utils
RUN yum-config-manager --enable crb

# install mysql-community for CC7+Python3.11
RUN wget https://dev.mysql.com/get/mysql80-community-release-el9-4.noarch.rpm && \
rpm -Uvh mysql80-community-release-*.noarch.rpm && \
yum install -y mysql-community-devel mysql-community-client
RUN yum install -y --allowerasing gcc make less git psmisc curl voms-clients-cpp wget httpd logrotate procps mod_ssl \
openssl-devel readline-devel bzip2-devel libffi-devel zlib-devel passwd voms-clients-java which mysql-devel mariadb

# install python
RUN mkdir /tmp/python && cd /tmp/python && \
Expand Down Expand Up @@ -89,6 +87,8 @@ RUN chmod -R 777 /etc/grid-security/certificates
RUN chmod -R 777 /data/harvester
RUN chmod -R 777 /data/condor
RUN chmod -R 777 /etc/httpd
RUN chmod -R 777 /etc/vomses
RUN chmod -R 777 /etc/grid-security/vomsdir/lsst
RUN chmod -R 777 /var/log/httpd
RUN chmod -R 777 /var/lib/logrotate
RUN mkdir -p /opt/harvester/etc/queue_config && chmod 777 /opt/harvester/etc/queue_config
Expand Down
8 changes: 6 additions & 2 deletions pandaharvester/harvesterbody/worker_adjuster.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ def __init__(self, queue_config_mapper):
self.maxNewWorkers = harvester_config.submitter.maxNewWorkers
except AttributeError:
self.maxNewWorkers = None
try:
self.activate_worker_factor = float(harvester_config.submitter.activateWorkerFactor)
except AttributeError:
self.activate_worker_factor = 1.0

# define number of workers to submit based on various information
def define_num_workers(self, static_num_workers, site_name):
Expand Down Expand Up @@ -145,11 +149,11 @@ def define_num_workers(self, static_num_workers, site_name):
else:
# limit the queue to the number of activated jobs to avoid empty pilots
try:
n_activated = max(job_stats[queue_name]["activated"], 1) # avoid no activity queues
n_activated = max(job_stats[queue_name]["activated"] * self.activate_worker_factor, 1) # avoid no activity queues
except KeyError:
# zero job in the queue
tmp_log.debug("no job in queue")
n_activated = 1
n_activated = max(1 - n_queue - n_ready - n_running, 0)
finally:
queue_limit = max_queued_workers
max_queued_workers = min(n_activated, max_queued_workers)
Expand Down
15 changes: 14 additions & 1 deletion pandaharvester/harvestersubmitter/htcondor_submitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def make_a_jdl(
prod_rc_permille=0,
token_dir=None,
is_gpu_resource=False,
n_core_factor=1,
**kwarg,
):
# make logger
Expand Down Expand Up @@ -206,8 +207,9 @@ def make_a_jdl(
n_core_total = int(_match.group(1))
tmpLog.debug(f"job attributes override by CRIC special_par: {attr}={str(_match.group(1))}")
# derived job attributes
n_core_total = n_core_total * n_core_factor
n_node = ceil(n_core_total / n_core_per_node)
request_ram_bytes = request_ram * 2**20
request_ram_bytes = request_ram * 2**20 * n_core_factor
request_ram_per_core = ceil(request_ram * n_node / n_core_total)
request_ram_bytes_per_core = ceil(request_ram_bytes * n_node / n_core_total)
request_cputime = request_walltime * n_core_total
Expand Down Expand Up @@ -243,6 +245,7 @@ def make_a_jdl(
"nCorePerNode": n_core_per_node,
"nCoreTotal": n_core_total,
"nNode": n_node,
"nCoreFactor": n_core_factor,
"requestRam": request_ram,
"requestRamBytes": request_ram_bytes,
"requestRamPerCore": request_ram_per_core,
Expand Down Expand Up @@ -338,6 +341,15 @@ def __init__(self, **kwarg):
else:
if (not self.nProcesses) or (self.nProcesses < 1):
self.nProcesses = 1
# ncore factor
try:
self.nCoreFactor = int(self.nCoreFactor)
except AttributeError:
self.nCoreFactor = 1
else:
self.nCoreFactor = int(self.nCoreFactor)
if (not self.nCoreFactor) or (self.nCoreFactor < 1):
self.nCoreFactor = 1
# executable file
try:
self.executableFile
Expand Down Expand Up @@ -824,6 +836,7 @@ def _choose_credential(workspec):
"log_dir": self.logDir,
"log_subdir": log_subdir,
"n_core_per_node": n_core_per_node,
"n_core_factor": self.nCoreFactor,
"panda_queue_name": panda_queue_name,
"x509_user_proxy": proxy,
"ce_info_dict": ce_info_dict,
Expand Down
13 changes: 12 additions & 1 deletion pandaharvester/harvestersubmitter/slurm_submitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ def __init__(self, **kwarg):
PluginBase.__init__(self, **kwarg)
if not hasattr(self, "localQueueName"):
self.localQueueName = "grid"
# ncore factor
try:
self.nCoreFactor = int(self.nCoreFactor)
except AttributeError:
self.nCoreFactor = 1
else:
self.nCoreFactor = int(self.nCoreFactor)
if (not self.nCoreFactor) or (self.nCoreFactor < 1):
self.nCoreFactor = 1

# submit workers
def submit_workers(self, workspec_list):
Expand Down Expand Up @@ -91,12 +100,13 @@ def make_placeholder_map(self, workspec):
n_core_per_node = n_core_per_node_from_queue

n_core_total = workspec.nCore if workspec.nCore else n_core_per_node
n_core_total = n_core_total * self.nCoreFactor
request_ram = max(workspec.minRamCount, 1 * n_core_total) if workspec.minRamCount else 1 * n_core_total
request_disk = workspec.maxDiskCount * 1024 if workspec.maxDiskCount else 1
request_walltime = workspec.maxWalltime if workspec.maxWalltime else 0

n_node = ceil(n_core_total / n_core_per_node)
request_ram_bytes = request_ram * 2**20
request_ram_bytes = request_ram * 2**20 * self.nCoreFactor
request_ram_per_core = ceil(request_ram * n_node / n_core_total)
request_ram_bytes_per_core = ceil(request_ram_bytes * n_node / n_core_total)
request_cputime = request_walltime * n_core_total
Expand All @@ -106,6 +116,7 @@ def make_placeholder_map(self, workspec):
placeholder_map = {
"nCorePerNode": n_core_per_node,
"nCoreTotal": n_core_total,
"nCoreFactor": self.nCoreFactor,
"nNode": n_node,
"requestRam": request_ram,
"requestRamBytes": request_ram_bytes,
Expand Down

0 comments on commit fe516ec

Please sign in to comment.