From f928a582de0f723de9f304254503d1c686b60d5d Mon Sep 17 00:00:00 2001
From: Pieter Verschaffelt
Date: Tue, 19 Mar 2024 09:20:41 +0100
Subject: [PATCH] Implement Docker image for new Unipept suffix array

---
Note: this patch was edited by hand after it was generated; the blob ids on the `index` lines below still refer to
the originally generated file contents.

 .github/workflows/build_index_image.yml  | 53 ++++++++++++++++++++
 examples/suffix-array/docker-compose.yml | 27 ++++++++++
 unipept-index/Dockerfile                 | 64 ++++++++++++++++++++++++
 unipept-index/initialize_container.sh    | 44 ++++++++++++++++
 4 files changed, 188 insertions(+)
 create mode 100644 .github/workflows/build_index_image.yml
 create mode 100644 examples/suffix-array/docker-compose.yml
 create mode 100644 unipept-index/Dockerfile
 create mode 100644 unipept-index/initialize_container.sh

diff --git a/.github/workflows/build_index_image.yml b/.github/workflows/build_index_image.yml
new file mode 100644
index 0000000..e98d958
--- /dev/null
+++ b/.github/workflows/build_index_image.yml
@@ -0,0 +1,53 @@
+# This is a basic workflow to help you get started with Actions
+
+name: Build Index Image
+
+# Controls when the workflow will run
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - 'main'
+    tags:
+      - 'v*.*.*'
+
+permissions:
+  packages: write
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      - name: Login to GHCR
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Docker meta
+        id: metadata
+        uses: docker/metadata-action@v3
+        with:
+          images: |
+            ghcr.io/unipept/unipept-index
+          tags: |
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=sha
+      - name: Build and push
+        uses: docker/build-push-action@v3
+        with:
+          context: 'unipept-index'
+          platforms: linux/amd64,linux/arm64
+          push: true
+          file: 'unipept-index/Dockerfile'
+          tags: ${{ steps.metadata.outputs.tags }}
+          labels: ${{ steps.metadata.outputs.labels }}
+
+
diff --git a/examples/suffix-array/docker-compose.yml b/examples/suffix-array/docker-compose.yml
new file mode 100644
index 0000000..51ef9ea
--- /dev/null
+++ b/examples/suffix-array/docker-compose.yml
@@ -0,0 +1,27 @@
+version: '3.7'
+
+services:
+  db:
+    # Image that is built and pushed by the `Build Index Image` workflow. Append `:<tag>` to select a specific build.
+    image: ghcr.io/unipept/unipept-index
+    environment:
+      # Valid values: swissprot, trembl
+      DB_TYPES: swissprot
+      DB_SOURCES: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
+    restart: always
+    ports:
+      - "3000:3000"
+    volumes:
+      # Volume that's being used to store the Unipept index. This index is a special kind of structure that can easily
+      # be queried and filtered by taxa (if required). The index will be created on first run of an image (if it is
+      # not yet present in the folder that's mounted) and will greatly increase the build speed of the database.
+      - type: bind
+        # Change this to a folder on your hard drive where the Unipept index could be stored.
+        source: ./data/index
+        target: /index
+      - type: bind
+        source: ./data/temp
+        target: /tmp
+      - type: bind
+        source: ./data/suffix-array
+        target: /suffix-array
diff --git a/unipept-index/Dockerfile b/unipept-index/Dockerfile
new file mode 100644
index 0000000..836d173
--- /dev/null
+++ b/unipept-index/Dockerfile
@@ -0,0 +1,64 @@
+FROM ubuntu:22.04
+
+LABEL maintainer="Pieter Verschaffelt "
+
+# Let a RUN instruction fail when any command of a pipeline fails (e.g. the download in `curl ... | bash`)
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install the system packages. The package lists are removed in the same layer to keep the image size down.
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        git \
+        wget \
+        unzip \
+        gawk \
+        binutils \
+        gcc \
+        libssl-dev \
+        uuid-runtime \
+        pv \
+        pigz \
+        parallel \
+        curl \
+        sudo \
+        lz4 \
+        dos2unix \
+        pkg-config \
+        clang \
+        cmake && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN git clone --depth 1 https://github.com/unipept/unipept-database
+# Use the webserver branch for now
+RUN git clone --depth 1 -b webserver https://github.com/BramDevlaminck/Thesis_rust_implementations
+# Initialize all submodules for the suffix array
+RUN git -C /Thesis_rust_implementations submodule update --init --recursive
+
+# Make a directory that contains the initialization data for this script
+RUN mkdir "/scripts"
+COPY "initialize_container.sh" "/scripts/initialize_container.sh"
+# Make sure that this script is executable in the container
+RUN chmod u+x "/scripts/initialize_container.sh"
+
+# Put the binaries of the Rust toolchain that's installed below on the PATH
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Install the Rust toolchain (https://rustup.rs/), compile the Rust binaries for the database and for the unipept
+# index, and uninstall Rust again to keep the image size down. This all happens in one RUN instruction because files
+# that are removed by a later instruction remain stored in the earlier layers of the image.
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
+    /unipept-database/scripts/build_binaries.sh && \
+    rm -rf /unipept-database/scripts/helper_scripts/unipept-database-rs/target/ && \
+    (cd /Thesis_rust_implementations && cargo build --release) && \
+    rustup self uninstall -y
+
+# Database types that should be processed by this image. Delimited by commas.
+ENV DB_TYPES=swissprot
+# Database URLs that should be downloaded and processed by this container. Delimited by commas, n'th item in this list
+# should correspond to n'th db type given in DB_TYPES arg.
+ENV DB_SOURCES=https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
+# How much memory is the database construction script allowed to use for sorting?
+ENV SORT_MEMORY=2G
+
+# Start initialization of the index construction process once the container is constructed.
+CMD ["/scripts/initialize_container.sh"]
diff --git a/unipept-index/initialize_container.sh b/unipept-index/initialize_container.sh
new file mode 100644
index 0000000..91376e4
--- /dev/null
+++ b/unipept-index/initialize_container.sh
@@ -0,0 +1,44 @@
+#! /bin/bash
+
+# Exit immediately when an error occurs
+set -e
+# Exit when a command in one of the pipelines fails
+set -o pipefail
+
+# Check if the index is already present and only needs to be started (or needs to be constructed if no data is available)
+if [ ! -f "/suffix-array/.completed" ]
+then
+    echo "Started construction of the suffix array"
+
+    # Clear the suffix_array directory and restart the build of the index. Only the contents are removed: the directory
+    # itself can be a mount point (see the docker-compose example) and a mount point cannot be deleted.
+    mkdir -p /suffix-array
+    find /suffix-array -mindepth 1 -delete
+
+    # Clear the temp directory (including hidden files)
+    find /tmp -mindepth 1 -delete
+
+    # First, build the input files for the database and store them in the temporary directory
+    /unipept-database/scripts/build_database.sh -i '/index' -d '/tmp' -m "$SORT_MEMORY" database "$DB_TYPES" "$DB_SOURCES" '/tmp/tables'
+
+    # Move the required files for the suffix array to its directory (and extract only the required columns)
+    lz4cat /tmp/tables/uniprot_entries.tsv.lz4 | cut -f2,4,7 > /suffix-array/proteins.tsv
+    lz4cat /tmp/tables/taxons.tsv.lz4 > /suffix-array/taxons.tsv
+
+    # Remove all other database files
+    rm -rf '/tmp/tables'
+
+    # Now, construct the actual suffix array
+    /Thesis_rust_implementations/target/release/suffixarray_builder -d /suffix-array/proteins.tsv -t /suffix-array/taxons.tsv --sample-rate 3 -o /suffix-array/sa.bin
+
+    # Write the `.completed` file to indicate that this suffix array build is complete
+    touch /suffix-array/.completed
+
+    echo "Finished construction of the suffix array"
+fi
+
+echo "Start loading suffix array"
+
+# At this point, we know that the suffix array exists and we can start it. `exec` replaces this shell with the server
+# process, so the server itself receives the signals that are sent to the container (e.g. by `docker stop`).
+exec /Thesis_rust_implementations/target/release/suffixarray_server -d /suffix-array/proteins.tsv -t /suffix-array/taxons.tsv -i /suffix-array/sa.bin