Skip to content

Commit

Permalink
Speed up data import
Browse files Browse the repository at this point in the history
It turns out that dateparser was horribly inefficient. Replacing it
with python-dateutil and using SQLAlchemy's bulk insert method has
cut the startup time from about 40-45 seconds down to 2-3
seconds. 🎉

Signed-off-by: Major Hayden <[email protected]>
  • Loading branch information
major authored and F-X64 committed Jun 13, 2024
1 parent 6f2e87e commit 0aa81d6
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 208 deletions.
53 changes: 21 additions & 32 deletions cid/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
from typing import Any

import dateparser
from dateutil import parser
from packaging.version import Version
from sqlalchemy import desc
from sqlalchemy.orm import Session
Expand Down Expand Up @@ -95,43 +95,32 @@ def import_aws_images(db: Session, images: list) -> None:

for image in images:
# sqlite requires dates to be in Python's datetime format.
creation_date = dateparser.parse(image.get("CreationDate"))
deprecation_time = dateparser.parse(image.get("DeprecationTime"))
creation_date = parser.parse(image.get("CreationDate"))
deprecation_time = parser.parse(image.get("DeprecationTime"))

# Extract the RHEL version number from the image name.
image_name = extract_aws_version(image.get("Name"))

image_obj = AwsImage(
id=image.get("ImageId"),
name=image.get("Name"),
arch=image.get("Architecture"),
version=image_name,
imageId=image.get("ImageId"),
date=creation_date,
virt=image.get("VirtualizationType"),
provider=image.get("ImageOwnerAlias"),
region=image.get("Region"),
imageLocation=image.get("ImageLocation"),
imageType=image.get("ImageType"),
public=image.get("Public"),
ownerId=image.get("OwnerId"),
platformDetails=image.get("PlatformDetails"),
usageOperation=image.get("UsageOperation"),
state=image.get("State"),
blockDeviceMappings=image.get("BlockDeviceMappings"),
description=image.get("Description"),
enaSupport=image.get("EnaSupport"),
hypervisor=image.get("Hypervisor"),
rootDeviceName=image.get("RootDeviceName"),
rootDeviceType=image.get("RootDeviceType"),
sriovNetSupport=image.get("SriovNetSupport"),
deprecationTime=deprecation_time,
)
import_queue.append(image_obj)
# AWS has a LOT of data. We generate a list of dictionaries and then
# import them in bulk at the end.
image_dict = {
"id": image.get("ImageId"),
"name": image.get("Name"),
"arch": image.get("Architecture"),
"version": image_name,
"imageId": image.get("ImageId"),
"date": creation_date,
"provider": image.get("ImageOwnerAlias"),
"region": image.get("Region"),
"description": image.get("Description"),
"deprecationTime": deprecation_time,
}
import_queue.append(image_dict)

logger.info("Adding %s AWS images to the database", len(import_queue))

db.add_all(import_queue)
# This lower-level method is more efficient for inserting lots of rows at once.
db.execute(AwsImage.__table__.insert(), import_queue)
db.commit()


Expand Down Expand Up @@ -163,7 +152,7 @@ def import_google_images(db: Session, images: list) -> None:

for image in images:
# sqlite requires dates to be in Python's datetime format.
creation_timestamp = dateparser.parse(image.get("creationTimestamp"))
creation_timestamp = parser.parse(image.get("creationTimestamp"))

# Extract the RHEL version number from the image name.
image_name = extract_google_version(image.get("name"))
Expand Down
16 changes: 1 addition & 15 deletions cid/models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Database models."""

from sqlalchemy import JSON, Boolean, Column, DateTime, Integer, String
from sqlalchemy import JSON, Column, DateTime, Integer, String

from cid.database import Base

Expand All @@ -17,23 +17,9 @@ class AwsImage(Base):
version = Column(String)
imageId = Column(String)
date = Column(DateTime)
virt = Column(String)
provider = Column(String)
region = Column(String)
imageLocation = Column(String)
imageType = Column(String)
public = Column(Boolean)
ownerId = Column(String)
platformDetails = Column(String)
usageOperation = Column(String)
state = Column(String)
blockDeviceMappings = Column(JSON)
description = Column(String)
enaSupport = Column(Boolean)
hypervisor = Column(String)
rootDeviceName = Column(String)
rootDeviceType = Column(String)
sriovNetSupport = Column(String)
deprecationTime = Column(DateTime)


Expand Down
Loading

0 comments on commit 0aa81d6

Please sign in to comment.