# Check and test ChEBI updates (run #205)
# Workflow file for this run

name: Check and test ChEBI updates
# Triggers: manual dispatch, pull requests that modify this workflow file,
# and a cron schedule.
on:
  workflow_dispatch:
  # tests whether it is working on PR
  pull_request:
    paths:
      - '.github/workflows/chebi.yml'
  schedule:
    # Run the workflow on the 1st and 15th day of each month
    - cron: "0 0 1,15 * *"
# Token scopes requested for the jobs of this workflow.
permissions:
  contents: write
  pages: write
  id-token: write
  issues: write
jobs:
  # Compares the date of the newest entry in the ChEBI archive index with the
  # date recorded in datasources/chebi/config and publishes the result
  # (release numbers, both dates, NEW_RELEASE flag) as job outputs.
  check_new_release:
    runs-on: ubuntu-latest
    name: Check latest release date
    outputs:
      RELEASE_NUMBER: ${{ steps.check_download.outputs.RELEASE_NUMBER }}
      CURRENT_RELEASE_NUMBER: ${{ steps.check_download.outputs.CURRENT_RELEASE_NUMBER }}
      NEW_RELEASE: ${{ steps.check_download.outputs.NEW_RELEASE }}
      DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
      DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
    steps:
      # checkout the repository
      - name: Checkout
        uses: actions/checkout@v4
      # check the release date for the latest ChEBI release
      - name: Check for new ChEBI release
        id: check_download
        run: |
          ## Read config
          ## NOTE(review): the script reads $release and $date right after
          ## sourcing, so the config presumably defines both - confirm.
          . datasources/chebi/config .
          echo 'Accessing the ChEBI archive'
          wget https://ftp.ebi.ac.uk/pub/databases/chebi/archive/ -O chebi_index.html
          ## $release still holds the value from the config at this point
          echo "CURRENT_RELEASE_NUMBER=$release" >> "$GITHUB_OUTPUT"
          ## Date of the release currently in use (from the config)
          date_old=$date
          ## Check date for last element in index: the 4th line from the end
          ## of the index page is taken as the row of the latest release
          latest_row=$(tail -4 chebi_index.html | head -1)
          ## Extract the date from the latest release (up to the day)
          date_new=$(printf '%s\n' "$latest_row" | grep -oP '<td align="right">\K[0-9-]+\s[0-9:]+(?=\s+</td>)' | awk '{print $1}')
          ## "|| true" keeps a failed match from aborting the script silently;
          ## the explicit guard below reports it instead
          release=$(printf '%s\n' "$latest_row" | grep -oP '(?<=a href="rel)\d\d\d' || true)
          ##Clean up
          rm chebi_index.html
          ## "date -d ''" resolves to today's midnight, so an empty value on
          ## either side would silently produce a wrong comparison
          if [ -z "$date_new" ] || [ -z "$release" ]; then
            echo "::error::Could not parse the latest release date/number from the ChEBI archive index"
            exit 1
          fi
          if [ -z "$date_old" ]; then
            echo "::error::No release date found in datasources/chebi/config"
            exit 1
          fi
          #https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-output-parameter
          echo "RELEASE_NUMBER=$release" >> "$GITHUB_OUTPUT"
          #Store dates to output
          echo "DATE_OLD=$date_old" >> "$GITHUB_OUTPUT"
          echo "DATE_NEW=$date_new" >> "$GITHUB_OUTPUT"
          ##Compare the dates and set variable if date_new is more recent
          timestamp1=$(date -d "$date_new" +%s)
          timestamp2=$(date -d "$date_old" +%s)
          if [ "$timestamp1" -gt "$timestamp2" ]; then
            echo 'New release available', "$release"
            ## NEW_RELEASE is only ever written as 'true'; it stays empty
            ## when there is no newer release
            echo "NEW_RELEASE=true" >> "$GITHUB_OUTPUT"
          else
            echo 'No new release available'
          fi
          echo "Date of latest release: $date_new", "Date of release of the current version: $date_old"
# Job: download the new ChEBI SDF, run the Java preprocessing on it, validate
# and diff the result against the stored mapping, then open issues.
  test_sdf_processing:
    name: Test release
    # NEW_RELEASE is either the string 'true' or empty, so compare explicitly
    # instead of relying on string truthiness.
    if: needs.check_new_release.outputs.NEW_RELEASE == 'true'
    needs: check_new_release
    env:
      RELEASE_NUMBER: ${{ needs.check_new_release.outputs.RELEASE_NUMBER }}
      DATE_OLD: ${{ needs.check_new_release.outputs.DATE_OLD }}
      DATE_NEW: ${{ needs.check_new_release.outputs.DATE_NEW }}
      CURRENT_RELEASE_NUMBER: ${{ needs.check_new_release.outputs.CURRENT_RELEASE_NUMBER }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Download SDF for new release
        run: |
          ##Download ChEBI SDF file
          echo "$RELEASE_NUMBER"
          # Store outputs from previous job in environment variables
          # (the variable name was previously written as "$DATE_NEW", which
          # expanded to the date itself)
          echo "DATE_NEW=$DATE_NEW" >> "$GITHUB_ENV"
          echo "RELEASE_NUMBER=$RELEASE_NUMBER" >> "$GITHUB_ENV"
          echo "CURRENT_RELEASE_NUMBER=$CURRENT_RELEASE_NUMBER" >> "$GITHUB_ENV"
          url_release="https://ftp.ebi.ac.uk/pub/databases/chebi/archive/rel$RELEASE_NUMBER/SDF/"
          echo "URL_RELEASE=$url_release" >> "$GITHUB_ENV"
          wget "${url_release}ChEBI_complete_3star.sdf.gz"
          ##Unzip gz file:
          gunzip ChEBI_complete_3star.sdf.gz #TODO replace by config var
          ##Print file size
          ls -lh ChEBI_complete_3star.sdf
          # Set up vars from config file
          chmod +x datasources/chebi/config
          . datasources/chebi/config .
          ##Create temp. folder to store the data in
          mkdir -p mapping_preprocessing/datasources/chebi/data
      # step 4: run the Java .jar for ChEBI preprocessing
      - name: Set up Java
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'
      # Download current version from Zenodo
      #- name: Download current mapping file from Zenodo
      #  id: download_current_zenodo
      #  env:
      #    zenodo_token: ${{ secrets.ZENODO }}
      #  run: |
      #    # Set up vars from config file
      #    chmod +x datasources/chebi/config
      #    . datasources/chebi/config .
      #    echo "file name: $to_check_from_zenodo"
      #    # Request Zenodo API to download the file
      #    curl -H "Authorization: Bearer $zenodo_token" -LJO https://zenodo.org/api/record/$zenodo_file_id/$to_check_from_zenodo
      - name: Test SDF processing
        id: sdf_process
        run: |
          inputFile="ChEBI_complete_3star.sdf"
          outputDir="datasources/chebi/recentData/"
          # Make sure the directory the results are written to exists
          mkdir -p "$outputDir"
          # Run the Java program inside the "if" condition: run steps use
          # "bash -e" by default, so a separate "$?" test after a failing
          # command would never be reached and FAILED=true never recorded.
          if java -cp java/target/mapping_prerocessing-0.0.1-jar-with-dependencies.jar org.sec2pri.chebi_sdf "$inputFile" "$outputDir"; then
            # Java program succeeded
            echo "Successful preprocessing of ChEBI data."
            echo "FAILED=false" >> "$GITHUB_ENV"
          else
            # Java program failed
            echo "Failed preprocessing of ChEBI data."
            echo "FAILED=true" >> "$GITHUB_ENV"
          fi
      - name: RegEx and Diff test
        if: ${{ env.FAILED == 'false' }}
        run: |
          chmod +x datasources/chebi/config
          . datasources/chebi/config .
          old="datasources/chebi/data/$to_check_from_zenodo"
          new="datasources/chebi/recentData/$to_check_from_zenodo"
          # remove headers
          sed -i '1d' "$new"
          sed -i '1d' "$old"
          # qc integrity of IDs
          wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
          CHEBI_ID=$(awk -F '\t' '$1 == "ChEBI" {print $10}' datasources.tsv)
          # An empty pattern would make the checks below meaningless
          if [ -z "$CHEBI_ID" ]; then
            echo "Error: No ChEBI ID pattern found in datasources.tsv."
            echo "FAILED=true" >> "$GITHUB_ENV"
            exit 1
          fi
          # Split the file into two separate files for each column
          awk -F '\t' '{print $1}' "$new" | tr -d '\r' > column1.txt
          awk -F '\t' '{print $2}' "$new" | tr -d '\r' > column2.txt
          # "grep -v" succeeds when at least one line does NOT match, so
          # success is the error case. -P is used because the pattern
          # presumably contains PCRE classes such as \d (TODO confirm against
          # datasources.tsv); -x requires the whole line to match.
          if grep -qvxP -- "$CHEBI_ID" column1.txt; then
            echo "Error: At least one line in the primary column does not match pattern."
            grep -nvxP -- "$CHEBI_ID" column1.txt || true
            echo "FAILED=true" >> "$GITHUB_ENV"
            exit 1
          else
            echo "All lines in the primary column match the pattern."
          fi
          # Same check for the secondary column
          if grep -qvxP -- "$CHEBI_ID" column2.txt; then
            echo "Error: At least one line in the secondary column does not match pattern."
            grep -nvxP -- "$CHEBI_ID" column2.txt || true
            echo "FAILED=true" >> "$GITHUB_ENV"
            exit 1
          else
            echo "All lines in the secondary column match the pattern."
          fi
          # sort them
          sort "$old" | tr -d "\r" > ids_old.txt
          sort "$new" | tr -d "\r" > ids_new.txt
          echo "Performing diff between the sorted lists of IDs"
          # Perform a diff between the sorted lists of IDs
          # (diff exits non-zero when the files differ, hence "|| true")
          output_file=diff.txt
          diff -u ids_old.txt ids_new.txt > "$output_file" || true
          # The first two lines of a unified diff are the "---"/"+++" file
          # headers; skip them so they are not counted as changed pairs, and
          # strip only the leading +/- marker from each line.
          # retrieve new lines
          added=$(tail -n +3 "$output_file" | grep '^+CHEBI' | sed 's/^+//') || true
          # retrieve removed lines
          removed=$(tail -n +3 "$output_file" | grep '^-' | sed 's/^-//') || true
          # Added lines flagged as withdrawn are reported as removed instead
          withdrawn=$(grep 'Entry Withdrawn' <<< "$added" || true)
          added=$(grep -v 'Entry Withdrawn' <<< "$added" || true)
          removed=$(printf '%s\n%s\n' "$removed" "$withdrawn" | sed '/^$/d')
          # Drop lines that appear in both lists
          added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
          removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
          added=$added_filtered
          removed=$removed_filtered
          # count them (non-empty lines only; grep -c prints 0 but exits 1
          # when nothing matches, hence "|| true")
          count_removed=$(grep -c . <<< "$removed" || true)
          count_added=$(grep -c . <<< "$added" || true)
          # label empty lists for the report
          if [ -z "$removed" ]; then
            removed="None"
          fi
          if [ -z "$added" ]; then
            added="None"
          fi
          echo ________________________________________________
          echo " removed pairs "
          echo ________________________________________________
          echo "$removed"
          echo ________________________________________________
          echo " added pairs "
          echo ________________________________________________
          echo "$added"
          echo _________________________________________________
          echo "What's changed:"
          echo "- Added id pairs: $count_added"
          echo "- Removed id pairs: $count_removed"
          # Store to env to use in issue
          echo "ADDED=$count_added" >> "$GITHUB_ENV"
          echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
          count=$((count_added + count_removed))
          echo "COUNT=$count" >> "$GITHUB_ENV"
          total_old=$(wc -l < "$old")
          # Percentage of changed pairs relative to the old file; guard
          # against dividing by zero when the old file has no data lines
          if [ "$total_old" -gt 0 ]; then
            change=$((100 * count / total_old))
          else
            echo "Warning: $old contains no data lines; reporting CHANGE=0."
            change=0
          fi
          echo "CHANGE=$change" >> "$GITHUB_ENV"
      - name: 'Upload processed data as artifacts'
        uses: actions/upload-artifact@v4
        with:
          name: chebi_processed
          path: datasources/chebi/recentData/*
      - uses: JasonEtco/create-an-issue@v2
        if: ${{ env.COUNT != 0 }}
        name: Post issue about update availability
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: ChEBI
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
          update_existing: true
      - uses: JasonEtco/create-an-issue@v2
        name: Post issue about failing test
        # A condition without a status function implicitly requires
        # success(), so failure() is needed for this step to run after an
        # earlier step has failed.
        if: ${{ failure() || env.FAILED == 'true' }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: chebi
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
          update_existing: true
# Docker
# Sends a repository_dispatch event of type "update-event" to
# sec2pri/omicsFixID. The job declares no `needs`, so it does not wait for
# the other jobs in this workflow.
  trigger-docker-update:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Trigger docker release
        uses: peter-evans/repository-dispatch@v3
        with:
          repository: sec2pri/omicsFixID
          event-type: update-event
          token: ${{ secrets.PING }}
          # JSON payload: triggering ref, commit SHA and the datasource name
          client-payload: >-
            {"ref": "${{ github.ref }}", "sha": "${{ github.sha }}", "datasource": "ChEBI"}