Skip to content

Commit

Permalink
Implement Docker image for new Unipept suffix array
Browse files Browse the repository at this point in the history
  • Loading branch information
pverscha committed Mar 19, 2024
1 parent bc0b735 commit f928a58
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 0 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/build_index_image.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Builds the Unipept index Docker image for linux/amd64 and linux/arm64 and
# pushes it to the GitHub Container Registry (ghcr.io).

name: Build Index Image

# Runs on manual dispatch, on every push to `main`, and on every pushed tag
# that matches `v*.*.*`.
on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
    tags:
      - 'v*.*.*'

# As soon as one permission is listed, every unlisted permission is set to
# "none". The checkout step needs read access to the repository contents, so
# it has to be granted explicitly next to the package write access.
permissions:
  contents: read
  packages: write

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      # v2 of this action targets the deprecated Node 12 runtime.
      - name: Checkout
        uses: actions/checkout@v4
      # QEMU provides emulation for the non-native platform of the build.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      - name: Login to GHCR
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      # Derives the image tags and labels from the Git ref: semver tags for
      # version tags, and a commit-SHA based tag.
      # v3 of this action targets the deprecated Node 12 runtime.
      - name: Docker meta
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            ghcr.io/unipept/unipept-index
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=sha
      - name: Build and push
        uses: docker/build-push-action@v3
        with:
          context: 'unipept-index'
          platforms: linux/amd64,linux/arm64
          push: true
          file: 'unipept-index/Dockerfile'
          tags: ${{ steps.metadata.outputs.tags }}
          labels: ${{ steps.metadata.outputs.labels }}


26 changes: 26 additions & 0 deletions examples/suffix-array/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
version: '3.7'

services:
  db:
    # Published Unipept index image. A bare local image ID (such as the output
    # of `docker build`) only exists on the machine that built it and cannot
    # be pulled by anyone else.
    # NOTE(review): assumes a `latest` tag has been published; otherwise pin
    # one of the published version or `sha-...` tags here.
    image: ghcr.io/unipept/unipept-index:latest
    environment:
      # Valid values: swissprot, trembl
      DB_TYPES: 'swissprot'
      DB_SOURCES: 'https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz'
    restart: always
    ports:
      - "3000:3000"
    volumes:
      # Volume that's being used to store the Unipept index. This index is a
      # special kind of structure that can easily be queried and filtered by
      # taxa (if required). The index will be created on first run of an image
      # (if it is not yet present in the folder that's mounted) and will
      # greatly increase the build speed of the database.
      - type: bind
        # Change this to a folder on your hard drive where the Unipept index
        # can be stored.
        source: /Users/pverscha/Documents/Unipept/data/index
        target: /index
      # Scratch space used while building the database tables. Change the
      # source to a folder on your hard drive.
      - type: bind
        source: /Users/pverscha/Documents/Unipept/data/temp
        target: /tmp
      # Folder in which the constructed suffix array is persisted between
      # container runs. Change the source to a folder on your hard drive.
      - type: bind
        source: /Users/pverscha/Documents/Unipept/data/suffix-array
        target: /suffix-array
64 changes: 64 additions & 0 deletions unipept-index/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Image that builds (on first start) and serves the Unipept suffix array index.
#
# Pinned to a fixed release so that rebuilding the image later yields the same
# base system instead of whatever `latest` happens to point to.
FROM ubuntu:22.04

LABEL maintainer="Pieter Verschaffelt <[email protected]>"

# `apt-get` is used instead of `apt` because the latter does not offer a stable
# command line interface for scripts. The package lists are removed in the same
# layer so that they do not end up in the image.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        git \
        wget \
        unzip \
        gawk \
        binutils \
        gcc \
        libssl-dev \
        uuid-runtime \
        pv \
        pigz \
        parallel \
        curl \
        sudo \
        lz4 \
        dos2unix \
        pkg-config \
        clang \
        cmake && \
    rm -rf /var/lib/apt/lists/*

RUN git clone --depth 1 https://github.com/unipept/unipept-database
# Use the webserver branch for now
RUN git clone --depth 1 -b webserver https://github.com/BramDevlaminck/Thesis_rust_implementations
# Initialize all submodules for the suffix array
RUN cd Thesis_rust_implementations && git submodule update --init --recursive

# Add the script that initializes the container (COPY creates /scripts) and
# make sure that it is executable in the container.
COPY "initialize_container.sh" "/scripts/initialize_container.sh"
RUN chmod u+x "/scripts/initialize_container.sh"

# Make the Rust toolchain (installed below) available to the build steps.
ENV PATH="/root/.cargo/bin:${PATH}"

# Install the Rust toolchain (https://rustup.rs/), compile the binaries for the
# database and for the Unipept index, and remove the toolchain and the database
# build artifacts again.
#
# All of this happens in a single RUN instruction on purpose: files that are
# deleted in a later layer are still stored in the earlier layer, so cleaning
# up in separate RUN instructions would not reduce the size of the image.
# The installer is downloaded to a file first so that a failed download aborts
# the build instead of piping an empty script into the shell.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs -o /tmp/rustup-init.sh && \
    bash /tmp/rustup-init.sh -y && \
    rm /tmp/rustup-init.sh && \
    /unipept-database/scripts/build_binaries.sh && \
    rm -rf /unipept-database/scripts/helper_scripts/unipept-database-rs/target/ && \
    (cd /Thesis_rust_implementations && cargo build --release) && \
    rustup self uninstall -y

# Database types that should be processed by this image. Delimited by commas.
ENV DB_TYPES=swissprot
# Database URLs that should be downloaded and processed by this container. Delimited by commas, n'th item in this list
# should correspond to n'th db type given in DB_TYPES arg.
ENV DB_SOURCES=https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
# How much memory is the database construction script allowed to use for sorting?
ENV SORT_MEMORY=2G

# Start initialization of the index construction process once the container is started. The exec form runs the
# script directly instead of wrapping it in `/bin/sh -c`.
CMD ["/scripts/initialize_container.sh"]
41 changes: 41 additions & 0 deletions unipept-index/initialize_container.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#! /bin/bash

# Entry point of the Unipept index container.
#
# If no completed suffix array is found in /suffix-array, the database tables are built first and the suffix array is
# constructed from them. Afterwards (or immediately, if a completed suffix array is already present), the suffix array
# server is started.
#
# Environment variables that are read while building:
#   SORT_MEMORY  value passed to the `-m` option of build_database.sh
#   DB_TYPES     database types passed to build_database.sh
#   DB_SOURCES   database source URLs passed to build_database.sh

# Exit immediately when an error occurs
set -e
# Exit when a command in one of the pipelines fails
set -o pipefail

# Check if the index is already present and only needs to be started (or needs to be constructed if no data is available)
if [ ! -f "/suffix-array/.completed" ]
then
    echo "Started construction of the suffix array"

    # Fail early with a clear message when the build configuration is missing, instead of passing shifted arguments
    # to the database build script.
    : "${SORT_MEMORY:?SORT_MEMORY must be set}"
    : "${DB_TYPES:?DB_TYPES must be set}"
    : "${DB_SOURCES:?DB_SOURCES must be set}"

    # Clear the suffix array directory and restart the build of the index. Only the contents are removed: the
    # directory itself can be a mount point, which cannot be deleted from inside the container.
    mkdir -p /suffix-array
    find /suffix-array -mindepth 1 -delete

    # Clear the temp directory (including hidden files)
    find /tmp -mindepth 1 -delete

    # First, build the input files for the database and store them in the temporary directory
    /unipept-database/scripts/build_database.sh -i '/index' -d '/tmp' -m "$SORT_MEMORY" database "$DB_TYPES" "$DB_SOURCES" '/tmp/tables'

    # Move the required files for the suffix array to its directory (and extract only the required columns)
    lz4cat /tmp/tables/uniprot_entries.tsv.lz4 | cut -f2,4,7 > /suffix-array/proteins.tsv
    lz4cat /tmp/tables/taxons.tsv.lz4 > /suffix-array/taxons.tsv

    # Remove all other database files
    rm -rf '/tmp/tables'

    # Now, construct the actual suffix array
    /Thesis_rust_implementations/target/release/suffixarray_builder -d /suffix-array/proteins.tsv -t /suffix-array/taxons.tsv --sample-rate 3 -o /suffix-array/sa.bin

    # Write the `.completed` file to indicate that this suffix array build is complete
    touch /suffix-array/.completed

    echo "Finished construction of the suffix array"
fi

echo "Start loading suffix array"

# At this point, we know that the suffix array exists and we can start it. `exec` replaces this shell with the server
# process, so that signals sent to this script's process (e.g. when the container is stopped) reach the server.
exec /Thesis_rust_implementations/target/release/suffixarray_server -d /suffix-array/proteins.tsv -t /suffix-array/taxons.tsv -i /suffix-array/sa.bin

0 comments on commit f928a58

Please sign in to comment.