-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement Docker image for new Unipept suffix array
- Loading branch information
Showing
4 changed files
with
184 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# Builds the multi-architecture Unipept index Docker image and pushes it to the
# GitHub Container Registry (ghcr.io).

name: Build Index Image

# Run manually, on every push to `main`, and on every `v<major>.<minor>.<patch>` tag.
on:
  workflow_dispatch:
  push:
    branches:
      - 'main'
    tags:
      - 'v*.*.*'

# Declaring `permissions` sets every scope that is not listed to `none`, so the
# read access used by the checkout step has to be granted explicitly next to
# the write access needed to push the image to ghcr.io.
permissions:
  contents: read
  packages: write

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      # QEMU + Buildx are required to build the non-native linux/arm64 variant.
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v2
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      - name: Login to GHCR
        uses: docker/login-action@v2
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}
      # Derives the image tags and labels: semver tags for version tags that
      # are pushed, and a commit-sha tag for every build.
      - name: Docker meta
        id: metadata
        uses: docker/metadata-action@v3
        with:
          images: |
            ghcr.io/unipept/unipept-index
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=sha
      - name: Build and push
        uses: docker/build-push-action@v3
        with:
          context: 'unipept-index'
          platforms: linux/amd64,linux/arm64
          push: true
          file: 'unipept-index/Dockerfile'
          tags: ${{ steps.metadata.outputs.tags }}
          labels: ${{ steps.metadata.outputs.labels }}
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Runs the Unipept index container.
#
# Optional environment variables (shell environment or a `.env` file):
#   UNIPEPT_INDEX_IMAGE  Image to run. A bare image ID only resolves on the
#                        machine that built it, so the default is the image
#                        name published to ghcr.io.
#                        NOTE(review): confirm that a `latest` tag is published,
#                        or set this variable to a locally built image.
#   UNIPEPT_DATA_DIR     Host folder that holds the `index`, `temp` and
#                        `suffix-array` sub-folders mounted below.
version: '3.7'

services:
  db:
    image: ${UNIPEPT_INDEX_IMAGE:-ghcr.io/unipept/unipept-index:latest}
    environment:
      # Valid values: swissprot, trembl
      DB_TYPES: swissprot
      DB_SOURCES: https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
    restart: always
    ports:
      - "3000:3000"
    volumes:
      # Volume that's being used to store the Unipept index. This index is a special kind of structure that can easily
      # be queried and filtered by taxa (if required). The index will be created on first run of an image (if it is
      # not yet present in the folder that's mounted) and will greatly increase the build speed of the database.
      - type: bind
        # Set UNIPEPT_DATA_DIR to a folder on your hard drive where the Unipept index can be stored.
        source: ${UNIPEPT_DATA_DIR:-/Users/pverscha/Documents/Unipept/data}/index
        target: /index
      # Scratch space used while the database tables are being built.
      - type: bind
        source: ${UNIPEPT_DATA_DIR:-/Users/pverscha/Documents/Unipept/data}/temp
        target: /tmp
      # Holds the constructed suffix array and the `.completed` marker file.
      - type: bind
        source: ${UNIPEPT_DATA_DIR:-/Users/pverscha/Documents/Unipept/data}/suffix-array
        target: /suffix-array
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
# Image that compiles the Unipept database tooling and the suffix array
# binaries, and that builds (on first start) and serves the Unipept suffix
# array through /scripts/initialize_container.sh.

# Pinned to a fixed release instead of the moving `latest` tag so that rebuilds
# are reproducible.
FROM ubuntu:22.04

LABEL maintainer="Pieter Verschaffelt <[email protected]>"

# Make a RUN step fail when any command of a pipeline fails. The default
# `/bin/sh -c` only looks at the last command, which would let a failed rustup
# download in the `curl | bash` step below go unnoticed.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Build and runtime dependencies. `update` and `install` share one layer, and
# the package lists are removed in that same layer so they never reach the image.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        binutils \
        clang \
        cmake \
        curl \
        dos2unix \
        gawk \
        gcc \
        git \
        libssl-dev \
        lz4 \
        parallel \
        pigz \
        pkg-config \
        pv \
        sudo \
        unzip \
        uuid-runtime \
        wget \
    && rm -rf /var/lib/apt/lists/*

RUN git clone --depth 1 https://github.com/unipept/unipept-database /unipept-database
# Use the webserver branch for now. `--recurse-submodules` also initializes all
# (nested) submodules that the suffix array needs.
RUN git clone --depth 1 -b webserver --recurse-submodules \
        https://github.com/BramDevlaminck/Thesis_rust_implementations /Thesis_rust_implementations

# Makes `cargo` and `rustup` resolvable during the build step below.
ENV PATH="/root/.cargo/bin:${PATH}"

# Install the Rust toolchain (https://rustup.rs/), compile the binaries for the
# database and for the Unipept index, and remove the database build artifacts
# and the toolchain again. This all happens in ONE layer: files deleted in a
# later layer would still be stored in the image. `cd` is used instead of
# WORKDIR so the working directory of the final container stays `/`.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | bash -s -- -y && \
    /unipept-database/scripts/build_binaries.sh && \
    rm -rf /unipept-database/scripts/helper_scripts/unipept-database-rs/target/ && \
    cd /Thesis_rust_implementations && \
    cargo build --release && \
    rustup self uninstall -y

# Database types that should be processed by this image. Delimited by commas.
ENV DB_TYPES=swissprot
# Database URLs that should be downloaded and processed by this container. Delimited by commas, the n'th item in this
# list should correspond to the n'th db type given in DB_TYPES.
ENV DB_SOURCES=https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
# How much memory is the database construction script allowed to use for sorting?
ENV SORT_MEMORY=2G

# Copy the start script last, so that editing it does not invalidate the cached
# (and slow) build layers above. COPY creates /scripts when it does not exist.
COPY initialize_container.sh /scripts/initialize_container.sh
# Make sure that this script is executable in the container
RUN chmod u+x /scripts/initialize_container.sh

# Start initialization of the index construction process once the container is
# started. Exec form runs the script directly instead of wrapping it in
# `/bin/sh -c`, so it receives the stop signal sent by `docker stop`.
CMD ["/scripts/initialize_container.sh"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#! /bin/bash

# Entry point of the Unipept index container.
#
# When /suffix-array/.completed does not exist, the database tables are built,
# the columns needed by the suffix array are extracted and the suffix array is
# constructed. Afterwards (or right away when the marker exists) the suffix
# array server is started.
#
# Environment:
#   SORT_MEMORY  value passed to build_database.sh through `-m`
#   DB_TYPES     database types passed to build_database.sh
#   DB_SOURCES   database URLs passed to build_database.sh

# Exit immediately when an error occurs
set -e
# Exit when a command in one of the pipelines fails
set -o pipefail

# Check if the index is already present and only needs to be started (or needs to be constructed if no data is available)
if [ ! -f "/suffix-array/.completed" ]
then
    echo "Started construction of the suffix array"

    # Fail early with a clear message instead of passing empty arguments on.
    : "${SORT_MEMORY:?SORT_MEMORY must be set}"
    : "${DB_TYPES:?DB_TYPES must be set}"
    : "${DB_SOURCES:?DB_SOURCES must be set}"

    # Clear the suffix array directory and restart the build of the index.
    # Only the contents are removed: the directory itself can be a mount point,
    # which cannot be deleted from inside the container.
    mkdir -p /suffix-array
    find /suffix-array -mindepth 1 -delete

    # Clear the temp directory
    rm -rf /tmp/*

    # First, build the input files for the database and store them in the temporary directory.
    # The variables are quoted so that the URLs are not word-split or glob-expanded.
    /unipept-database/scripts/build_database.sh -i '/index' -d '/tmp' -m "$SORT_MEMORY" database "$DB_TYPES" "$DB_SOURCES" '/tmp/tables'

    # Move the required files for the suffix array to its directory (and extract only the required columns)
    lz4cat /tmp/tables/uniprot_entries.tsv.lz4 | cut -f2,4,7 > /suffix-array/proteins.tsv
    lz4cat /tmp/tables/taxons.tsv.lz4 > /suffix-array/taxons.tsv

    # Remove all other database files
    rm -rf '/tmp/tables'

    # Now, construct the actual suffix array
    /Thesis_rust_implementations/target/release/suffixarray_builder -d /suffix-array/proteins.tsv -t /suffix-array/taxons.tsv --sample-rate 3 -o /suffix-array/sa.bin

    # Write the `.completed` file to indicate that this suffix array build is complete
    touch /suffix-array/.completed

    echo "Finished construction of the suffix array"
fi

echo "Start loading suffix array"

# At this point, we know that the suffix array exists and we can start it.
# `exec` replaces this shell with the server, so the server receives the
# signals sent to the container (e.g. on `docker stop`) directly.
exec /Thesis_rust_implementations/target/release/suffixarray_server -d /suffix-array/proteins.tsv -t /suffix-array/taxons.tsv -i /suffix-array/sa.bin