From dc620712cf97b28d19b8e19b3f29019dc3accf17 Mon Sep 17 00:00:00 2001
From: Christian Otto Stelter <76177803+stelterlab@users.noreply.github.com>
Date: Mon, 12 Jun 2023 08:31:12 +0200
Subject: [PATCH] refactored distribution handling, adding Flatcar and Fedora support (#17)

* refactoring crawler

- split Ubuntu handling out of updater/service.py first, to make the code more readable
- note: debugging code still in place

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>

* refactoring crawler

- split Debian handling from updater/service.py
- split Alma Linux handling from updater/service.py

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>

* Added support for crawling Flatcar Container Linux

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>

* - removed the no longer used release_update_check from updater/service.py
- added an error message for unsupported distributions
- updated changelog
- updated README.md with supported distributions

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>

* - use loguru for simple configuration of logging
- AlmaLinux crawling not yet fully functional again

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>

* - AlmaLinux crawling improved - it no longer simply fetches the first hit - and is working again
- extended requirements.txt for loguru

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>

* - added Debian 12 aka bookworm to image-sources.yaml
- fixed debug output for checksums

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>

* - added a first take on Fedora - template not yet finished
- the last-checksum query must be adjusted for distributions like Fedora
- the exporter must be adjusted

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>

* - adjusted the checksum query for Fedora - could be more generic for distributions with no "minor" release updates
- adjusted database queries - distribution_version is needed for Fedora
- updated the template for export

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>

* - added --debug as argument
- added a first take on Fedora Linux support to the crawler
- added Debian Linux 12 aka bookworm to the sample image-sources.yaml

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>

* - removed old branch from Dockerfile

---------

Signed-off-by: Christian Otto Stelter <christian.stelter@plusserver.com>
---
 CHANGELOG.md               |  10 ++
 README.md                  |  15 ++-
 crawler/core/config.py     |   8 +-
 crawler/core/database.py   |  85 ++++++++++++----
 crawler/core/exporter.py   |  20 ++--
 crawler/core/main.py       |   3 +-
 crawler/updater/alma.py    | 129 ++++++++++++++++++++++
 crawler/updater/debian.py  | 180 ++++++++++++++++++++++++++++++
 crawler/updater/fedora.py  | 156 ++++++++++++++++++++++++++
 crawler/updater/flatcar.py | 117 ++++++++++++++++++
 crawler/updater/service.py | 103 ++++++-------------
 crawler/updater/ubuntu.py  | 181 +++++++++++++++++++++++++++++++
 docker/Dockerfile          |   2 +-
 etc/image-sources.yaml     |  30 +++++-
 image-crawler.py           |  59 +++++++---
 requirements.txt           |   1 +
 templates/fedora.yml.j2    |  32 +++++++
 templates/flatcar.yml.j2   |   2 +-
 18 files changed, 1013 insertions(+), 120 deletions(-)
 create mode 100644 crawler/updater/alma.py
 create mode 100644 crawler/updater/debian.py
 create mode 100644 crawler/updater/fedora.py
 create mode 100644 crawler/updater/flatcar.py
 create mode 100644 crawler/updater/ubuntu.py
 create mode 100644 templates/fedora.yml.j2

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c8bd0fb..abffb9b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,15 @@
 # Changelog
 
+## 2023-06-11
+- refactoring of the updater parts - split into a single file per distribution
+- removed the generic release_update_check from updater/service.py
+- added support for Flatcar Container Linux
+- updated README.md
+- added loguru for easy configuration of logging
+- added optional debug output (via loglevel)
+- added --debug as argument
+- added a first take on Fedora Linux support to the crawler
+- added Debian Linux 12 aka bookworm to the sample image-sources.yaml
 ## 2023-06-01
 - updated example Dockerfile to new repos

diff --git a/README.md b/README.md
index ae9eb44..7d5eca5 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,16 @@
 
 OpenStack Image Crawler for checking image sources, gathering update information and generating image catalog files for the [OpenStack Image Manager](https://github.com/osism/openstack-image-manager) (or similar tools).
 
+Supported distributions:
+
+- Ubuntu Linux
+- Debian Linux
+- AlmaLinux
+- Flatcar Container Linux
+- Fedora Linux
+
+Note: Flatcar Container Linux offers only zipped images, so a direct upload via OpenStack Image Manager/Glance is not supported (yet).
+
 ## Requirements
 
 ### Git repository for holding the image catalogs (optional)
@@ -13,7 +23,7 @@ If there is no remote_repository entry in the config, the git actions are disabl
 
 ## Installation
 
-Tested on Ubuntu 20.04 LTS + 22.04 LTS (should work with Python 3.8+ on other OSs, too)
+Tested on Ubuntu 20.04 LTS + 22.04 LTS. Should work with Python 3.8+ on other OSs, too. Optional: build the docker container.
 
 ```
 sudo apt install git python3-pip python3-venv
@@ -43,7 +53,7 @@ Usage:
 ```
 ./image-crawler.py -h
 
-plusserver Image Crawler v0.1
+plusserver Image Crawler v0.4.0
 
 usage: image-crawler.py [-h] [--config CONFIG] [--sources SOURCES] [--init-db]
                         [--export-only] [--updates-only]
@@ -56,6 +66,7 @@ optional arguments:
   --init-db       initialize image catalog database
   --export-only   export only existing image catalog
   --updates-only  check only for updates, do not export catalog
+  --debug         give more output for debugging
 ```
 
 ### Helper: Historian

diff --git a/crawler/core/config.py b/crawler/core/config.py
index 9b3c516..b20a441 100644
--- a/crawler/core/config.py
+++ b/crawler/core/config.py
@@ -1,4 +1,6 @@
 import yaml
+
+from loguru import logger
 from pathlib import Path
 
 
@@ -10,12 +12,12 @@ def config_read(name, msg="config"):
     try:
         config = yaml.safe_load(Path(name).read_text())
     except PermissionError:
-        print("ERROR: could not open config - please check file permissions")
+        logger.error("could not open config - please check file permissions")
         return None
     except yaml.YAMLError as error:
-        print("ERROR: %s" % error)
+        logger.error(error)
         return None
-    print("Successfully read %s from %s" % (msg, name))
+    logger.info("Successfully read %s from %s" % (msg, name))
 
     return config

diff --git a/crawler/core/database.py b/crawler/core/database.py
index 6de9538..4db7175 100644
--- a/crawler/core/database.py
+++ b/crawler/core/database.py
@@ -1,5 +1,7 @@
 import sys
 import sqlite3
+
+from loguru import logger
 from pathlib import Path
 
 
@@ -9,7 +11,7 @@ def database_connect(name):
         try:
             connection = sqlite3.connect(name)
         except sqlite3.OperationalError as error:
-            print("ERROR: %s" % error)
+            logger.error(error)
             return None
         return connection
     else:
@@ -23,7 +25,7 @@ def database_disconnect(connection):
 def database_initialize(name, prog_dirname):
     path = Path(name)
     if path.is_file():
-        print("WARNING: database %s already exists. Cowardly refusing action." % name)
+        logger.warning("database %s already exists. Cowardly refusing action."
% name) else: create_statement_fqfn = prog_dirname + "/lib/initialize-image-catalog.sql" create_statement_file_path = Path(create_statement_fqfn) @@ -32,21 +34,45 @@ def database_initialize(name, prog_dirname): create_statement = db_init_file.read() db_init_file.close() else: - raise SystemError("Template initialize-image-catalog.sql not found") + logger.error("Template initialize-image-catalog.sql not found") + raise SystemExit(1) try: connection = sqlite3.connect(name) except sqlite3.OperationalError as error: - print("ERROR: %s" % error) + logger.error(error) database_cursor = connection.cursor() try: database_cursor.execute(create_statement) except Exception as error: - print('ERROR: create table failed with the following error "%s"' % error) + logger.error('create table failed with the following error "%s"' % error) connection.close() - print("New database created under %s" % name) + logger.info("New database created under %s" % name) + +def db_get_last_checksum_fedora(connection, distribution): + try: + database_cursor = connection.cursor() + database_cursor.execute( + "SELECT checksum FROM image_catalog " + "WHERE distribution_name = '%s' " + "ORDER BY id DESC LIMIT 1" % distribution + ) + except sqlite3.OperationalError as error: + logger.error(error) + raise SystemExit(1) + + row = database_cursor.fetchone() + + if row is None: + logger.debug("no previous entries found") + last_checksum = "sha256:none" + else: + last_checksum = row[0] + database_cursor.close() + + return last_checksum def db_get_last_checksum(connection, distribution, release): try: @@ -58,12 +84,13 @@ def db_get_last_checksum(connection, distribution, release): "ORDER BY id DESC LIMIT 1" % (distribution, release) ) except sqlite3.OperationalError as error: - raise SystemError("SQLite error: %s" % error) + logger.error(error) + raise SystemExit(1) row = database_cursor.fetchone() if row is None: - # print("no previous entries found") + logger.debug("no previous entries found") last_checksum = "sha256:none" else: last_checksum = row[0] @@ -87,7 +114,7 @@ def db_get_release_versions(connection, distribution, release, limit): "ORDER BY id DESC LIMIT %d" % (distribution, release, limit) ) except sqlite3.OperationalError as error: - print("SQLite error: %s" % error) + logger.error(error) sys.exit(1) row = database_cursor.fetchone() @@ -119,7 +146,8 @@ def read_version_from_catalog(connection, distribution, release, version): "ORDER BY ID" % (distribution, release, version) ) except sqlite3.OperationalError as error: - raise SystemError("SQLite error: %s" % error) + logger.error(error) + raise SystemExit(1) image_catalog = {} image_catalog["versions"] = {} @@ -154,7 +182,8 @@ def write_catalog_entry(connection, update): ) connection.commit() except sqlite3.OperationalError as error: - raise SystemError("SQLite error: %s" % error) + logger.error(error) + raise SystemExit(1) database_cursor.close() @@ -170,7 +199,8 @@ def update_catalog_entry(connection, update): ) connection.commit() except sqlite3.OperationalError as error: - raise SystemError("SQLite error: %s" % error) + logger.error(error) + raise SystemExit(1) database_cursor.close() @@ -186,7 +216,7 @@ def write_or_update_catalog_entry(connection, update): ) if update["version"] in existing_entry["versions"]: - print("Updating version " + update["version"]) + logger.info("Updating version " + update["version"]) return update_catalog_entry(connection, update) else: return write_catalog_entry(connection, update) @@ -195,16 +225,26 @@ def 
write_or_update_catalog_entry(connection, update): def read_release_from_catalog(connection, distribution, release, limit): try: database_cursor = connection.cursor() - database_cursor.execute( - "SELECT version,checksum,url,release_date " - "FROM (SELECT * FROM image_catalog " - "WHERE distribution_name = '%s' " - "AND distribution_release = '%s' " - "ORDER BY id DESC LIMIT %d) " - "ORDER BY ID" % (distribution, release, limit) - ) + if release == "all": + database_cursor.execute( + "SELECT version,checksum,url,release_date,distribution_release " + "FROM (SELECT * FROM image_catalog " + "WHERE distribution_name = '%s' " + "ORDER BY id DESC LIMIT %d) " + "ORDER BY ID" % (distribution, limit) + ) + else: + database_cursor.execute( + "SELECT version,checksum,url,release_date,distribution_release " + "FROM (SELECT * FROM image_catalog " + "WHERE distribution_name = '%s' " + "AND distribution_release = '%s' " + "ORDER BY id DESC LIMIT %d) " + "ORDER BY ID" % (distribution, release, limit) + ) except sqlite3.OperationalError as error: - raise SystemError("SQLite error: %s" % error) + logger.error(error) + raise SystemExit(1) image_catalog = {} image_catalog["versions"] = {} @@ -215,5 +255,6 @@ def read_release_from_catalog(connection, distribution, release, limit): image_catalog["versions"][version]["checksum"] = image[1] image_catalog["versions"][version]["url"] = image[2] image_catalog["versions"][version]["release_date"] = image[3] + image_catalog["versions"][version]["distribution_release"] = image[4] return image_catalog diff --git a/crawler/core/exporter.py b/crawler/core/exporter.py index faa94a1..a62989c 100644 --- a/crawler/core/exporter.py +++ b/crawler/core/exporter.py @@ -2,6 +2,7 @@ import os from crawler.core.database import read_release_from_catalog +from loguru import logger def export_image_catalog( @@ -11,18 +12,19 @@ def export_image_catalog( # create directory (once) - only necessary when not created by git clone if not os.path.exists(local_repository): try: - print("Creating repository directory (%s)" % local_repository) + logger.info("Creating repository directory (%s)" % local_repository) os.makedirs(local_repository) except os.error as error: - raise SystemExit( - "FATAL: Creating directory %s failed with %s" + logger.error( + "Creating directory %s failed with %s" % (local_repository, error) ) + raise SystemExit(1) for source in sources_catalog["sources"]: if source["name"] in updated_sources: distribution = source["name"] - print("Exporting image catalog for " + distribution) + logger.info("Exporting image catalog for " + distribution) header_file = open(template_path + "/header.yml") catalog_export = header_file.read() header_file.close() @@ -44,6 +46,7 @@ def export_image_catalog( limit = release["limit"] else: limit = 3 + release_catalog = read_release_from_catalog( connection, distribution, release["name"], limit ) @@ -74,17 +77,18 @@ def export_image_catalog_all( # create directory (once) - only necessary when not created by git clone if not os.path.exists(local_repository): try: - print("Creating repository directory (%s)" % local_repository) + logger.info("Creating repository directory (%s)" % local_repository) os.makedirs(local_repository) except os.error as error: - raise SystemExit( - "FATAL: Creating directory %s failed with %s" + logger.error( + "Creating directory %s failed with %s" % (local_repository, error) ) + raise SystemExit(1) for source in sources_catalog["sources"]: distribution = source["name"] - print("Exporting image catalog for " + 
distribution)
+        logger.info("Exporting image catalog for " + distribution)
         header_file = open(template_path + "/header.yml")
         catalog_export = header_file.read()
         header_file.close()
diff --git a/crawler/core/main.py b/crawler/core/main.py
index ccc594d..22fd771 100644
--- a/crawler/core/main.py
+++ b/crawler/core/main.py
@@ -1,11 +1,12 @@
 from crawler.updater.service import image_update_service
+from loguru import logger
 
 
 def crawl_image_sources(image_source_catalog, database):
     updated_sources = {}
 
     for source in image_source_catalog["sources"]:
-        print("\nChecking updates for Distribution " + source["name"])
+        logger.info("Checking updates for Distribution " + source["name"])
         updated_releases = image_update_service(database, source)
         if updated_releases:
             updated_sources[source["name"]] = {}
diff --git a/crawler/updater/alma.py b/crawler/updater/alma.py
new file mode 100644
index 0000000..3bfa9f1
--- /dev/null
+++ b/crawler/updater/alma.py
@@ -0,0 +1,129 @@
+# alma.py
+#
+# crawl distribution Alma Linux
+
+import requests
+import re
+
+from crawler.web.generic import url_get_last_modified
+from crawler.web.directory import web_get_checksum, web_get_current_image_metadata
+
+from bs4 import BeautifulSoup
+from loguru import logger
+
+
+def build_image_url(release, imagefile_name):
+    if not release["baseURL"].endswith("/"):
+        base_url = release["baseURL"] + "/"
+    else:
+        base_url = release["baseURL"]
+
+    return base_url + release["releasepath"] + "/" + imagefile_name
+
+def get_metadata(release, image_filedate):
+    if not release["baseURL"].endswith("/"):
+        base_url = release["baseURL"] + "/"
+    else:
+        base_url = release["baseURL"]
+
+    requestURL = base_url + release["releasepath"]
+    # TODO: make this configurable in image-sources.yaml
+    #
+    # group(1) contains the major version, e.g. 9
+    # group(2) contains the full version including the minor version, e.g. 9.2
+    # group(3) contains the release date, e.g. 20230513
+    # ex. AlmaLinux-9-GenericCloud-9.2-20230513.x86_64.qcow2
+    filename_pattern = re.compile(r"AlmaLinux-(\d+)-GenericCloud-(\d+\.\d+)-(\d+)\.x86_64")
+
+    logger.debug("request_URL: " + requestURL)
+
+    request = requests.get(requestURL, allow_redirects=True)
+    soup = BeautifulSoup(request.text, "html.parser")
+
+    for link in soup.find_all("a"):
+        data = link.get("href")
+        logger.debug("data: " + data)
+
+        if filename_pattern.search(data):
+            logger.debug("pattern matched for " + data)
+            extract = filename_pattern.search(data)
+            release_date = extract.group(3)
+            version = release_date
+
+            logger.debug("url: " + build_image_url(release, data))
+            logger.debug("last version: " + version)
+            logger.debug("release_date: " + image_filedate)
+
+            return {
+                "url": build_image_url(release, data),
+                "version": version,
+                "release_date": image_filedate,
+            }
+
+    return None
+
+def alma_update_check(release, last_checksum):
+    # as specified in image-sources.yaml
+    # baseURL: https://repo.almalinux.org/almalinux/9/
+    if not release["baseURL"].endswith("/"):
+        base_url = release["baseURL"] + "/"
+    else:
+        base_url = release["baseURL"]
+
+    checksum_url = base_url + release["releasepath"] + "/" + release["checksumname"]
+
+    logger.debug("checksum_url: " + checksum_url)
+
+    # as specified in image-sources.yaml
+    # imagename: AlmaLinux-9-GenericCloud-latest.x86_64
+    # extension: qcow2
+    imagename = release["imagename"] + "." 
+ release["extension"] + + logger.debug("imagename: " + imagename) + + current_checksum = web_get_checksum(checksum_url, imagename) + + if current_checksum is None: + logger.error( + "no matching checksum found - check image (%s) " + "and checksum filename (%s)" % (imagename, release["checksumname"]) + ) + return None + + logger.debug("current_checksum: " + current_checksum) + + # as specified in image-sources.yaml + # algorithm: sha256 + current_checksum = release["algorithm"] + ":" + current_checksum + + if current_checksum != last_checksum: + logger.debug("current_checksum " + current_checksum + " differs from last_checksum " + last_checksum) + image_url = base_url + release["releasepath"] + "/" + imagename + + logger.debug("image_url:" + image_url) + + image_filedate = url_get_last_modified(image_url) + + logger.debug("image_filedate:" + image_filedate) + + image_metadata = get_metadata(release, image_filedate) + if image_metadata is not None: + logger.debug("got metadata") + update = {} + update["release_date"] = image_metadata["release_date"] + update["url"] = image_metadata["url"] + update["version"] = image_metadata["version"] + update["checksum"] = current_checksum + return update + else: + logger.warning("got no metadata") + return None + + return None diff --git a/crawler/updater/debian.py b/crawler/updater/debian.py new file mode 100644 index 0000000..6c87098 --- /dev/null +++ b/crawler/updater/debian.py @@ -0,0 +1,180 @@ +# debian.py +# +# crawl distribution Debian + +import requests +import datetime + +from crawler.web.generic import url_get_last_modified +from crawler.web.directory import web_get_checksum, web_get_current_image_metadata + +from bs4 import BeautifulSoup +from loguru import logger + + +def build_image_url(release, versionpath): + if not release["baseURL"].endswith("/"): + base_url = release["baseURL"] + "/" + else: + base_url = release["baseURL"] + + if not versionpath.endswith("/"): + versionpath = versionpath + "/" + + return ( + base_url + + versionpath + + release["imagename"] + + "-" + + version_from_path(versionpath) + + "." 
+ + release["extension"] + ) + +def release_date_from_version(release_version): + # 20230601-1398 + release_date = ( + release_version[0:4] + + "-" + + release_version[4:6] + + "-" + + release_version[6:8] + ) + return release_date + +def version_from_path(versionpath): + # the path within the releases directory has the format + # 20230601-1398/ + + if versionpath.endswith("/"): + versionpath = versionpath.rstrip("/") + + return versionpath + +def get_metadata(release, image_filedate): + filedate = image_filedate.replace("-", "") + requestURL = release["baseURL"] + + request = requests.get(requestURL, allow_redirects=True) + soup = BeautifulSoup(request.text, "html.parser") + + for link in soup.find_all("a"): + data = link.get("href") + if data.find(filedate) != -1: + release_version_path = data + version = version_from_path(release_version_path) + if version is None: + return None + + release_date = release_date_from_version(version) + if release_date is None: + return None + + logger.debug("url: " + build_image_url(release, release_version_path)) + logger.debug("last version: " + version) + logger.debug("release_date: " + release_date) + + return { + "url": build_image_url( + release, release_version_path + ), + "version": version, + "release_date": release_date, + } + + # release is behind file date + search_date = datetime.date(int(filedate[0:4]), int(filedate[4:6]), int(filedate[6:8])) + + max_days_back = 6 + days_back = 1 + + while days_back <= max_days_back: + search_date = search_date - datetime.timedelta(days=1) + version = search_date.strftime("%Y%m%d") + + for link in soup.find_all("a"): + data = link.get("href") + if data.find(version) != -1: + release_version_path = data + version = version_from_path(release_version_path) + if version is None: + return None + + release_date = release_date_from_version(version) + if release_date is None: + return None + + logger.debug("url: " + build_image_url(release, release_version_path)) + logger.debug("last version: " + version) + logger.debug("release_date: " + release_date) + + return { + "url": build_image_url( + release, release_version_path + ), + "version": version, + "release_date": release_date, + } + + days_back = days_back + 1 + + return None + +def debian_update_check(release, last_checksum): + # as specified in image-sources.yaml + # baseURL: https://cloud.debian.org/images/cloud/bullseye/ + if not release["baseURL"].endswith("/"): + base_url = release["baseURL"] + "/" + else: + base_url = release["baseURL"] + + checksum_url = base_url + release["releasepath"] + "/" + release["checksumname"] + + logger.debug("checksum_url: " + checksum_url) + + # as specified in image-sources.yaml + # imagename: debian-11-genericcloud-amd64 + # extension: qcow2 + imagename = release["imagename"] + "." 
+ release["extension"] + + logger.debug("imagename: " + imagename) + + current_checksum = web_get_checksum(checksum_url, imagename) + + if current_checksum is None: + logger.error( + "no matching checksum found - check image (%s) " + "and checksum filename (%s)" % (imagename, release["checksumname"]) + ) + return None + + logger.debug("current_checksum: " + current_checksum) + + # as specified in image-sources.yaml + # algorithm: sha512 + current_checksum = release["algorithm"] + ":" + current_checksum + + if current_checksum != last_checksum: + logger.debug("current_checksum " + current_checksum + " differs from last_checksum " + last_checksum) + image_url = base_url + release["releasepath"] + "/" + imagename + + logger.debug("image_url: " + image_url) + + image_filedate = url_get_last_modified(image_url) + + logger.debug("image_filedate: " + image_filedate) + + image_metadata = get_metadata(release, image_filedate) + if image_metadata is not None: + logger.debug("got metadata") + update = {} + update["release_date"] = image_metadata["release_date"] + update["url"] = image_metadata["url"] + update["version"] = image_metadata["version"] + update["checksum"] = current_checksum + return update + else: + logger.warn("got no metadata") + return None + + return None diff --git a/crawler/updater/fedora.py b/crawler/updater/fedora.py new file mode 100644 index 0000000..a69a543 --- /dev/null +++ b/crawler/updater/fedora.py @@ -0,0 +1,156 @@ +# fedora.py +# +# crawl distribution Fedora Linux + +import requests +import re + +from crawler.web.generic import url_get_last_modified, url_fetch_content + +from bs4 import BeautifulSoup +from loguru import logger + + +def get_latest_release(release): + # as specified in image-sources.yaml + # baseURL: https://ftp.plusline.net/fedora/linux/releases/ + if not release["baseURL"].endswith("/"): + base_url = release["baseURL"] + "/" + else: + base_url = release["baseURL"] + release_pattern = re.compile(r"(\d+)") + releases_path = base_url + "/" + release["releasepath"] + + request = requests.get(releases_path, allow_redirects=True) + soup = BeautifulSoup(request.text, "html.parser") + + last_link = "" + + # we need the last link matching release_pattern + for link in soup.find_all("a"): + data = link.get("href") + + if release_pattern.search(data): + last_link = data + + # last_link contains "38/" relative link address with ending slash + extract = release_pattern.search(last_link) + release_id = extract.group(0) + + logger.debug("release_id: " + release_id) + + # ToDo - in case of no last_link found we need an None + return release_id + +def get_image_filename(release, images_url): + request = requests.get(images_url, allow_redirects=True) + soup = BeautifulSoup(request.text, "html.parser") + + last_link = "" + + for link in soup.find_all("a"): + data = link.get("href") + + if release["extension"] in data: + # logger.debug("match: " + data) + last_link = data + + if len(last_link) > 0: + return last_link + else: + return None + +def get_checksum(release, images_url, images_filename): + request = requests.get(images_url, allow_redirects=True) + soup = BeautifulSoup(request.text, "html.parser") + + last_link = "" + + # we need the last link matching CHECKSUM + for link in soup.find_all("a"): + data = link.get("href") + + if release["checksumname"] in data: + last_link = data + + logger.debug("last_link: " + last_link) + + checksum_url = images_url + "/" + last_link + + checksum_list = url_fetch_content(checksum_url) + if checksum_list is None: + return None + + 
for line in checksum_list.splitlines(): + # logger.debug("line: " + line) + + # skip comment starting with hash + if re.match('^#', line): + continue + if images_filename in line: + # logger.debug("matched: " + line) + (filename, new_checksum) = line.split(" = ") + # logger.debug("new_checksum: " + new_checksum) + + return new_checksum + + return None + +def fedora_update_check(release, last_checksum): + # as specified in image-sources.yaml + # baseURL: https://ftp.plusline.net/fedora/linux/releases/ + if not release["baseURL"].endswith("/"): + base_url = release["baseURL"] + "/" + else: + base_url = release["baseURL"] + + release_id = get_latest_release(release) + + images_url = base_url + release["releasepath"] + "/" + release_id + "/" + release["imagepath"] + + image_filename = get_image_filename(release, images_url) + + if image_filename is None: + logger.warn("did not find any matching filenames") + return None + + logger.debug("image_filename: " + image_filename) + + logger.debug("checksum_path: " + images_url) + + current_checksum = get_checksum(release, images_url, image_filename) + + if current_checksum is None: + logger.error( + "no matching checksum found - check image (%s) " + "and checksum filename (%s)" % (image_filename, release["checksumname"]) + ) + return None + + logger.debug("current_checksum: " + current_checksum) + + # as specified in image-sources.yaml + # algorithm: sha256 + current_checksum = release["algorithm"] + ":" + current_checksum + + if current_checksum != last_checksum: + logger.debug("current_checksum " + current_checksum + " differs from last_checksum " + last_checksum) + + image_url = images_url + "/" + image_filename + + logger.debug("image_url: " + image_url) + + image_filedate = url_get_last_modified(image_url) + + logger.debug("image_filedate: " + image_filedate) + + update = {} + update["release_date"] = image_filedate + update["url"] = image_url + update["version"] = image_filedate.replace("-", "") + update["checksum"] = current_checksum + update["release_id"] = release_id + + return update + + return None diff --git a/crawler/updater/flatcar.py b/crawler/updater/flatcar.py new file mode 100644 index 0000000..a3c7cc9 --- /dev/null +++ b/crawler/updater/flatcar.py @@ -0,0 +1,117 @@ +# flatcar.py +# +# crawl distribution Flatcar Container Linux + +import requests +import re + +from crawler.web.generic import url_get_last_modified +from crawler.web.directory import web_get_checksum + +from bs4 import BeautifulSoup +from loguru import logger + + +def build_image_url(release, versionpath): + if not release["baseURL"].endswith("/"): + base_url = release["baseURL"] + "/" + else: + base_url = release["baseURL"] + + if not versionpath.endswith("/"): + versionpath = versionpath + "/" + + return ( + base_url + versionpath + release["imagename"] + "." 
+ release["extension"] + ) + +def get_metadata(release, image_filedate): + filedate = image_filedate.replace("-", "") + requestURL = release["baseURL"] + # version format is 3510.2.2 + version_pattern = re.compile(r"\d+\.\d+\.\d+") + + request = requests.get(requestURL, allow_redirects=True) + soup = BeautifulSoup(request.text, "html.parser") + + last_link = "" + + # we need the last link matching version_pattern + for link in soup.find_all("a"): + data = link.get("href") + + if version_pattern.search(data): + last_link = data + + # last_link contains "./3510.2.2/" relative link address with dot and slashes + extract = version_pattern.search(last_link) + version = extract.group(0) + + logger.debug("url: " + build_image_url(release, version)) + logger.debug("last version: " + version) + logger.debug("release_date: " + filedate) + + return { + "url": build_image_url(release, version), + "version": version, + "release_date": filedate, + } + +def flatcar_update_check(release, last_checksum): + # as specified in image-sources.yaml + # baseURL: https://cloud-images.flatcar.com/releases/jammy/ + if not release["baseURL"].endswith("/"): + base_url = release["baseURL"] + "/" + else: + base_url = release["baseURL"] + + checksum_url = base_url + release["releasepath"] + "/" + release["checksumname"] + + logger.debug("checksum_url: " + checksum_url) + + # as specified in image-sources.yaml + # imagename: flatcar-22.04-server-cloudimg-amd64 + # extension: img + imagename = release["imagename"] + "." + release["extension"] + + logger.debug("imagename: " + imagename) + + current_checksum = web_get_checksum(checksum_url, imagename) + + if current_checksum is None: + logger.error( + "no matching checksum found - check image (%s) " + "and checksum filename (%s)" % (imagename, release["checksumname"]) + ) + return None + + logger.debug("current_checksum: " + current_checksum) + + # as specified in image-sources.yaml + # algorithm: sha256 + current_checksum = release["algorithm"] + ":" + current_checksum + + if current_checksum != last_checksum: + logger.debug("current_checksum " + current_checksum + " differs from last_checksum " + last_checksum) + image_url = base_url + release["releasepath"] + "/" + imagename + + logger.debug("image_url: " + image_url) + + image_filedate = url_get_last_modified(image_url) + + logger.debug("image_filedate: " + image_filedate) + + image_metadata = get_metadata(release, image_filedate) + if image_metadata is not None: + logger.debug("got metadata") + update = {} + update["release_date"] = image_metadata["release_date"] + update["url"] = image_metadata["url"] + update["version"] = image_metadata["version"] + update["checksum"] = current_checksum + return update + else: + logger.warn("got no metadata") + return None + + return None diff --git a/crawler/updater/service.py b/crawler/updater/service.py index dddc9ee..a1077eb 100644 --- a/crawler/updater/service.py +++ b/crawler/updater/service.py @@ -1,79 +1,56 @@ -from crawler.core.database import db_get_last_checksum, write_or_update_catalog_entry -from crawler.web.generic import url_get_last_modified -from crawler.web.directory import web_get_checksum, web_get_current_image_metadata +from loguru import logger - -def release_update_check(release, last_checksum): - # works for Ubuntu, Debian - if not release["baseURL"].endswith("/"): - base_url = release["baseURL"] + "/" - else: - base_url = release["baseURL"] - - # check on leading an trialing slash / for release path ? 
- checksum_url = base_url + release["releasepath"] + "/" + release["checksumname"] - # works for Ubuntu, Debian - # imagename _with_ proper extension to look for in checksum lists - imagename = release["imagename"] + "." + release["extension"] - - current_checksum = web_get_checksum(checksum_url, imagename) - if current_checksum is None: - print( - "ERROR: no matching checksum found - check image (%s) " - "and checksum filename (%s)" % (imagename, release["checksumname"]) - ) - return None - - current_checksum = release["algorithm"] + ":" + current_checksum - - if current_checksum != last_checksum: - image_url = base_url + release["releasepath"] + "/" + imagename - - image_filedate = url_get_last_modified(image_url) - - if "immutable" in release and release["immutable"]: - update = {} - update["release_date"] = image_filedate - update["url"] = image_url - update["version"] = release['name'] - update["checksum"] = current_checksum - - return update - - image_metadata = web_get_current_image_metadata(release, image_filedate) - if image_metadata is not None: - - update = {} - update["release_date"] = image_metadata["release_date"] - update["url"] = image_metadata["url"] - update["version"] = image_metadata["version"] - update["checksum"] = current_checksum - return update - else: - return None - - return None +from crawler.core.database import db_get_last_checksum, write_or_update_catalog_entry, db_get_last_checksum_fedora +from crawler.updater.ubuntu import ubuntu_update_check +from crawler.updater.debian import debian_update_check +from crawler.updater.alma import alma_update_check +from crawler.updater.flatcar import flatcar_update_check +from crawler.updater.fedora import fedora_update_check def image_update_service(connection, source): updated_releases = [] for release in source["releases"]: - last_checksum = db_get_last_checksum( - connection, source["name"], release["name"] - ) - catalog_update = release_update_check(release, last_checksum) + if "Fedora" in release["imagename"]: + last_checksum = db_get_last_checksum_fedora( + connection, source["name"] + ) + else: + last_checksum = db_get_last_checksum( + connection, source["name"], release["name"] + ) + + logger.debug("last_checksum:" + last_checksum) + + if "ubuntu" in release["imagename"]: + catalog_update = ubuntu_update_check(release, last_checksum) + elif "debian" in release["imagename"]: + catalog_update = debian_update_check(release, last_checksum) + elif "Alma" in release["imagename"]: + catalog_update = alma_update_check(release, last_checksum) + elif "flatcar" in release["imagename"]: + catalog_update = flatcar_update_check(release, last_checksum) + elif "Fedora" in release["imagename"]: + catalog_update = fedora_update_check(release, last_checksum) + else: + logger.error("Unsupported distribution " + source["name"] + " - please check your images-sources.yaml") + raise SystemExit(1) if catalog_update: - print("Update found for " + source["name"] + " " + release["name"]) - print("New release " + catalog_update["version"]) + logger.info("Update found for " + source["name"] + " " + release["name"]) + logger.info("New release " + catalog_update["version"]) # catalog_update anreichern mit _allen_ Daten für die DB - catalog_update["name"] = source["name"] + " " + release["name"] catalog_update["distribution_name"] = source["name"] - catalog_update["distribution_release"] = release["name"] + if "Fedora" in release["imagename"]: + catalog_update["name"] = source["name"] + " " + catalog_update["release_id"] + 
catalog_update["distribution_release"] = catalog_update["release_id"] + else: + catalog_update["name"] = source["name"] + " " + release["name"] + catalog_update["distribution_release"] = release["name"] catalog_update["release"] = release["name"] write_or_update_catalog_entry(connection, catalog_update) updated_releases.append(release["name"]) else: - print("No update found for " + source["name"] + " " + release["name"]) + logger.info("No update found for " + source["name"] + " " + release["name"]) return updated_releases diff --git a/crawler/updater/ubuntu.py b/crawler/updater/ubuntu.py new file mode 100644 index 0000000..d91490d --- /dev/null +++ b/crawler/updater/ubuntu.py @@ -0,0 +1,181 @@ +# ubuntu.py +# +# crawl distribution Ubuntu + +import requests +import datetime + +from crawler.web.generic import url_get_last_modified +from crawler.web.directory import web_get_checksum, web_get_current_image_metadata + +from bs4 import BeautifulSoup +from loguru import logger + + +def build_image_url(release, versionpath): + if not release["baseURL"].endswith("/"): + base_url = release["baseURL"] + "/" + else: + base_url = release["baseURL"] + + if not versionpath.endswith("/"): + versionpath = versionpath + "/" + + return ( + base_url + versionpath + release["imagename"] + "." + release["extension"] + ) + +def release_date_from_version(release_version): + # 20230606 + release_date = ( + release_version[0:4] + + "-" + + release_version[4:6] + + "-" + + release_version[6:8] + ) + return release_date + +def version_from_path(versionpath): + # the path within the releases directory has the format + # release-20230606/ + # or even in some cases + # release-20221101.1/ + + if versionpath.endswith("/"): + versionpath = versionpath.rstrip("/") + # if the path ends with an .X extension, strip it + if versionpath.find(".") != -1: + parts = versionpath.split(".") + versionpath = parts[0] + + # and remove the "release-" in front of the version + return versionpath.replace("release-", "") + +def get_metadata(release, image_filedate): + filedate = image_filedate.replace("-", "") + requestURL = release["baseURL"] + + request = requests.get(requestURL, allow_redirects=True) + soup = BeautifulSoup(request.text, "html.parser") + + for link in soup.find_all("a"): + data = link.get("href") + if data.find(filedate) != -1: + release_version_path = data + version = version_from_path(release_version_path) + if version is None: + return None + + release_date = release_date_from_version(version) + if release_date is None: + return None + + logger.debug("url: " + build_image_url(release, release_version_path)) + logger.debug("last version: " + version) + logger.debug("release_date: " + release_date) + + return { + "url": build_image_url( + release, release_version_path + ), + "version": version, + "release_date": release_date, + } + + # release is behind file date + search_date = datetime.date(int(filedate[0:4]), int(filedate[4:6]), int(filedate[6:8])) + + max_days_back = 6 + days_back = 1 + + while days_back <= max_days_back: + search_date = search_date - datetime.timedelta(days=1) + version = search_date.strftime("%Y%m%d") + + for link in soup.find_all("a"): + data = link.get("href") + if data.find(version) != -1: + release_version_path = data + version = version_from_path(release_version_path) + if version is None: + return None + + release_date = release_date_from_version(version) + if release_date is None: + return None + + logger.debug("url: " + build_image_url(release, release_version_path)) + logger.debug("last 
version: " + version) + logger.debug("release_date: " + release_date) + + return { + "url": build_image_url( + release, release_version_path + ), + "version": version, + "release_date": release_date, + } + + days_back = days_back + 1 + + return None + +def ubuntu_update_check(release, last_checksum): + # as specified in image-sources.yaml + # baseURL: https://cloud-images.ubuntu.com/releases/jammy/ + if not release["baseURL"].endswith("/"): + base_url = release["baseURL"] + "/" + else: + base_url = release["baseURL"] + + checksum_url = base_url + release["releasepath"] + "/" + release["checksumname"] + + logger.debug("checksum_url: " + checksum_url) + + # as specified in image-sources.yaml + # imagename: ubuntu-22.04-server-cloudimg-amd64 + # extension: img + imagename = release["imagename"] + "." + release["extension"] + + logger.debug("imagename: " + imagename) + + current_checksum = web_get_checksum(checksum_url, imagename) + + if current_checksum is None: + logger.error( + "no matching checksum found - check image (%s) " + "and checksum filename (%s)" % (imagename, release["checksumname"]) + ) + return None + + logger.debug("current_checksum: " + current_checksum) + + # as specified in image-sources.yaml + # algorithm: sha256 + current_checksum = release["algorithm"] + ":" + current_checksum + + if current_checksum != last_checksum: + logger.debug("current_checksum " + current_checksum + " differs from last_checksum " + last_checksum) + image_url = base_url + release["releasepath"] + "/" + imagename + + logger.debug("image_url: " + image_url) + + image_filedate = url_get_last_modified(image_url) + + logger.debug("image_filedate: " + image_filedate) + + image_metadata = get_metadata(release, image_filedate) + if image_metadata is not None: + logger.debug("got metadata") + update = {} + update["release_date"] = image_metadata["release_date"] + update["url"] = image_metadata["url"] + update["version"] = image_metadata["version"] + update["checksum"] = current_checksum + return update + else: + logger.warn("got no metadata") + return None + + return None diff --git a/docker/Dockerfile b/docker/Dockerfile index fd56ec2..68873b0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -16,7 +16,7 @@ RUN useradd -u ${HOST_UID} -s /bin/bash -c "crawler" -d "/opt/crawler" -m crawle libcap2-bin jq lsb-release software-properties-common curl dumb-init build-essential \ python3-dev python3-pip python3-venv -RUN git clone --depth 1 -b alma_slash_fix https://github.com/pluscloudopen/openstack-image-crawler.git /tmp/crawler && \ +RUN git clone --depth 1 https://github.com/pluscloudopen/openstack-image-crawler.git /tmp/crawler && \ mv /tmp/crawler/* /opt/crawler/ && \ rm -rf /tmp/crawler && \ cd /opt/crawler && \ diff --git a/etc/image-sources.yaml b/etc/image-sources.yaml index 02d6f65..bbab148 100644 --- a/etc/image-sources.yaml +++ b/etc/image-sources.yaml @@ -60,6 +60,14 @@ sources: extension: qcow2 checksumname: SHA512SUMS algorithm: sha512 + - name: '12' + codename: bookworm + baseURL: https://cloud.debian.org/images/cloud/bookworm/ + releasepath: latest + imagename: debian-12-genericcloud-amd64 + extension: qcow2 + checksumname: SHA512SUMS + algorithm: sha512 - name: AlmaLinux vendor: "AlmaLinux OS" @@ -84,14 +92,28 @@ sources: limit: 1 - name: Flatcar - vendor: "Flatcar" + vendor: "Kinvolk" releases: - - name: 3510.2.1 - codename: 3510.2.1 + - name: 'stable' + codename: 'none' baseURL: https://stable.release.flatcar-linux.net/amd64-usr - releasepath: 3510.2.1 + releasepath: current imagename: 
flatcar_production_openstack_image extension: img.gz checksumname: flatcar_production_openstack_image.img.gz.DIGESTS algorithm: md5 immutable: true + + - name: Fedora + vendor: "Fedora Project" + releases: + - name: 'all' + codename: none + baseURL: https://ftp.plusline.net/fedora/linux/ + releasepath: releases + imagepath: Cloud/x86_64/images + imagename: Fedora-Cloud-Base-(\d+)-(\d+\.\d+).x86_64 + extension: qcow2 + checksumname: CHECKSUM + algorithm: sha256 + limit: 1 diff --git a/image-crawler.py b/image-crawler.py index 90c2d9a..f907faf 100755 --- a/image-crawler.py +++ b/image-crawler.py @@ -6,12 +6,13 @@ # whenever a new image is detected all relevant information needed for # maintaining an image catalog # -# 2023-05-31 v0.3.1 christian.stelter@plusserver.com +# 2023-06-11 v0.4.0 christian.stelter@plusserver.com import argparse import sys import os - +# import logging +from loguru import logger from crawler.core.config import config_read from crawler.core.database import ( database_connect, @@ -22,10 +23,13 @@ from crawler.core.main import crawl_image_sources from crawler.git.base import clone_or_pull, update_repository +# logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(name)s %(levelname)s:%(message)s') -def main(): - print("\nplusserver Image Crawler v0.3.1\n") +# logging.basicConfig(level=logging.DEBUG) +# logger = logging.getLogger(__name__) + +def main(): working_directory = os.getcwd() program_directory = os.path.dirname(os.path.abspath(__file__)) @@ -63,8 +67,31 @@ def main(): required=False, help="check only for updates, do not export catalog", ) + parser.add_argument( + "--debug", + action="store_true", + required=False, + help="give more output for debugging", + ) args = parser.parse_args() + if args.debug: + log_level = "DEBUG" + log_format = ( + "{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | " + "{function}:{line} - {message}" + ) + else: + log_level = "INFO" + log_format = ( + "{message}" + ) + + logger.remove() + logger.add(sys.stderr, format=log_format, level=log_level, colorize=True) + + logger.info("plusserver Image Crawler v0.4.0 started") + # read configuration if args.config is not None: config_filename = args.config @@ -74,7 +101,8 @@ def main(): config = config_read(config_filename, "configuration") if config is None: - raise SystemExit("\nERROR: Unable to open config " + config_filename) + logger.error("ERROR: Unable to open config " + config_filename) + raise SystemExit(1) # read the image sources if args.sources is not None: @@ -108,28 +136,28 @@ def main(): git_ssh_command, ) else: - print("No image catalog repository configured") + logger.warning("No image catalog repository configured") # connect to database database = database_connect(config["database_name"]) if database is None: - print("\nERROR: Could not open database %s" % config["database_name"]) - print( - '\nRun "./image-crawler.py --init-db" to create a new database OR config check your etc/config.yaml' + logger.error("Could not open database %s" % config["database_name"]) + logger.error( + 'Run "./image-crawler.py --init-db" to create a new database OR config check your etc/config.yaml' ) sys.exit(1) # crawl image sources when requested if args.export_only: - print("\nSkipping repository crawling") + logger.info("Skipping repository crawling") updated_sources = {} else: - print("\nStart repository crawling") + logger.info("Start repository crawling") updated_sources = crawl_image_sources(image_source_catalog, database) # export image catalog if args.updates_only: - print("\nSkipping 
catalog export") + logger.info("Skipping catalog export") else: if config["local_repository"].startswith("/"): export_path = config["local_repository"] @@ -137,7 +165,7 @@ def main(): export_path = working_directory + "/" + config["local_repository"] if updated_sources: - print("\nExporting catalog to %s" % export_path) + logger.info("Exporting catalog to %s" % export_path) export_image_catalog( database, image_source_catalog, @@ -147,7 +175,7 @@ def main(): ) else: if args.export_only: - print("\nExporting all catalog files to %s" % export_path) + logger.info("Exporting all catalog files to %s" % export_path) export_image_catalog_all( database, image_source_catalog, @@ -161,10 +189,11 @@ def main(): database, config["local_repository"], updated_sources, git_ssh_command ) else: - print("No remote repository update needed.") + logger.info("No remote repository update needed.") database_disconnect(database) if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt index d97f1d6..14969ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,4 @@ Jinja2>=3.1.2 PyYAML>=6.0 requests>=2.28.1 validators>=0.20.0 +loguru>=0.7.0 \ No newline at end of file diff --git a/templates/fedora.yml.j2 b/templates/fedora.yml.j2 new file mode 100644 index 0000000..51baa32 --- /dev/null +++ b/templates/fedora.yml.j2 @@ -0,0 +1,32 @@ +{% for release_version in catalog['versions'] %} + - name: {{ catalog['name'] }} {{ catalog['versions'][release_version]['distribution_release'] }} + format: qcow2 + login: fedora + min_disk: 4 + min_ram: 512 + status: active + visibility: public + multi: true + meta: + architecture: x86_64 + hypervisor_type: qemu + hw_disk_bus: scsi + hw_rng_model: virtio + hw_scsi_model: virtio-scsi + hw_qemu_guest_agent: yes + hw_watchdog_action: reset + replace_frequency: quarterly + hotfix_hours: 0 + uuid_validity: last-1 + provided_until: none + os_distro: fedora + os_version: '{{ catalog['versions'][release_version]['distribution_release'] }}' + tags: [] + versions: + - version: '{{ release_version }}' + url: {{ catalog['versions'][release_version]['url'] }} + checksum: {{ catalog['versions'][release_version]['checksum'] }} + build_date: {{ catalog['versions'][release_version]['release_date'] }} + image_source: {{ catalog['versions'][release_version]['url'] }} + image_description: https://docs.fedoraproject.org/en-US/fedora/latest/release-notes/ +{%- endfor %} diff --git a/templates/flatcar.yml.j2 b/templates/flatcar.yml.j2 index 68a1b65..3284090 100644 --- a/templates/flatcar.yml.j2 +++ b/templates/flatcar.yml.j2 @@ -19,7 +19,7 @@ hotfix_hours: 0 uuid_validity: last-3 provided_until: none - os_distro: almalinux + os_distro: flatcar os_version: '{{ catalog['os_version'] }}' tags: [] latest_checksum_url: {{ metadata['baseURL'] }}{{ metadata['releasepath'] }}/{{ metadata['checksumname'] }}