Check and test ChEBI updates #215
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow identity, triggers and token permissions.
name: Check and test ChEBI updates

on:
  # Manual runs from the Actions tab
  workflow_dispatch:
  # Tests whether the workflow is working when a PR touches this workflow file
  pull_request:
    paths:
      - .github/workflows/chebi.yml
  schedule:
    # Run the workflow on the 1st and 15th day of each month
    - cron: '0 0 1,15 * *'

# Scopes granted to the GITHUB_TOKEN for every job in this workflow
permissions:
  contents: write
  pages: write
  id-token: write
  issues: write
jobs:
  check_new_release:
    name: Check latest release date
    runs-on: ubuntu-latest
    # Re-export the values written by the check_download step so that
    # jobs declaring `needs: check_new_release` can read them.
    outputs:
      RELEASE_NUMBER: ${{ steps.check_download.outputs.RELEASE_NUMBER }}
      CURRENT_RELEASE_NUMBER: ${{ steps.check_download.outputs.CURRENT_RELEASE_NUMBER }}
      NEW_RELEASE: ${{ steps.check_download.outputs.NEW_RELEASE }}
      DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
      DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
    steps:
# Check out the repository so later steps can read files from it
- uses: actions/checkout@v4
  name: Checkout
# Compare the date of the newest entry in the ChEBI archive index with the
# date loaded from datasources/chebi/config and publish the result as step
# outputs (CURRENT_RELEASE_NUMBER, RELEASE_NUMBER, DATE_OLD, DATE_NEW and,
# only when a newer release exists, NEW_RELEASE=true).
- name: Check for new ChEBI release
  id: check_download
  run: |
    ## Read config; this script reads $release and $date after sourcing it
    . datasources/chebi/config .
    echo 'Accessing the ChEBI archive'
    wget https://ftp.ebi.ac.uk/pub/databases/chebi/archive/ -O chebi_index.html
    # $release still holds the value from the config at this point
    echo "CURRENT_RELEASE_NUMBER=$release" >> "$GITHUB_OUTPUT"

    ## Check date for last element in index
    # The row describing the latest release is taken as the 4th line from
    # the end of the index page.
    latest_row=$(tail -4 chebi_index.html | head -1)
    ## Extract the date from the latest release (up to the day)
    date_new=$(printf '%s\n' "$latest_row" | grep -oP '<td align="right">\K[0-9-]+\s[0-9:]+(?=\s+</td>)' | awk '{print $1}') || true
    # Accept any number of digits so release numbers beyond three digits still parse
    release=$(printf '%s\n' "$latest_row" | grep -oP '(?<=a href="rel)\d+') || true

    # Fail loudly when the index could not be parsed. An empty string passed
    # to `date -d` resolves to today's date, which would otherwise be
    # reported as a brand-new release.
    if [ -z "$date_new" ] || [ -z "$release" ]; then
      echo "::error::Could not parse the latest release from the ChEBI archive index"
      exit 1
    fi

    #https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-output-parameter
    echo "RELEASE_NUMBER=$release" >> "$GITHUB_OUTPUT"

    ## Date of the currently used release, as loaded from the config
    date_old=$date
    if [ -z "$date_old" ]; then
      echo "::error::No 'date' value was loaded from datasources/chebi/config"
      exit 1
    fi

    # Store dates to output
    echo "DATE_OLD=$date_old" >> "$GITHUB_OUTPUT"
    echo "DATE_NEW=$date_new" >> "$GITHUB_OUTPUT"

    ## Compare the dates and set variable if date_new is more recent
    timestamp1=$(date -d "$date_new" +%s)
    timestamp2=$(date -d "$date_old" +%s)
    if [ "$timestamp1" -gt "$timestamp2" ]; then
      echo 'New release available', "$release"
      echo "NEW_RELEASE=true" >> "$GITHUB_OUTPUT"
    else
      # NEW_RELEASE is deliberately left unset here: a non-empty string
      # such as 'false' is truthy in a bare `if:` expression.
      echo 'No new release available'
    fi
    echo "Date of latest release: $date_new", "Date of release of the current version: $date_old"

    ## Clean up
    rm chebi_index.html
# Downloads and processes the new ChEBI release; runs only when the
# check_new_release job reported a newer release.
test_sdf_processing:
  name: Test release
  needs: check_new_release
  # Compare against the literal string: outputs are strings and any
  # non-empty value (including 'false') is truthy when used bare.
  if: ${{ needs.check_new_release.outputs.NEW_RELEASE == 'true' }}
  runs-on: ubuntu-latest
  # Expose the outputs of check_new_release to every step of this job
  env:
    RELEASE_NUMBER: ${{ needs.check_new_release.outputs.RELEASE_NUMBER }}
    DATE_OLD: ${{ needs.check_new_release.outputs.DATE_OLD }}
    DATE_NEW: ${{ needs.check_new_release.outputs.DATE_NEW }}
    CURRENT_RELEASE_NUMBER: ${{ needs.check_new_release.outputs.CURRENT_RELEASE_NUMBER }}
  steps:
# Check out the repository so later steps can read files from it
- uses: actions/checkout@v4
  name: Checkout repository
# Download and unpack the 3-star SDF of the release number held in
# $RELEASE_NUMBER, and record release details in $GITHUB_ENV.
- name: Download SDF for new release
  run: |
    # Store outputs from previous job in environment variables.
    # The key must be the literal name DATE_NEW; writing "$DATE_NEW=..."
    # would use the date itself as the variable name.
    echo "DATE_NEW=$DATE_NEW" >> "$GITHUB_ENV"
    echo "$RELEASE_NUMBER"
    echo "RELEASE_NUMBER=$RELEASE_NUMBER" >> "$GITHUB_ENV"
    echo "CURRENT_RELEASE_NUMBER=$CURRENT_RELEASE_NUMBER" >> "$GITHUB_ENV"

    ## Download ChEBI SDF file
    url_release="https://ftp.ebi.ac.uk/pub/databases/chebi/archive/rel$RELEASE_NUMBER/SDF/"
    echo "URL_RELEASE=$url_release" >> "$GITHUB_ENV"
    wget "${url_release}ChEBI_complete_3star.sdf.gz"

    ## Unzip gz file:
    gunzip ChEBI_complete_3star.sdf.gz #TODO replace by config var

    ## Print file size
    ls -lh ChEBI_complete_3star.sdf

    # Set up vars from config file
    chmod +x datasources/chebi/config
    . datasources/chebi/config .

    ## Create temp. folder to store the data in
    mkdir -p mapping_preprocessing/datasources/chebi/data
# Install a Java 11 (Temurin) runtime; the ChEBI preprocessing jar is
# executed with `java` in a later step.
- name: Set up Java
  uses: actions/setup-java@v4
  with:
    distribution: temurin
    java-version: "11"
# Download current version from Zenodo | |
#- name: Download current mapping file from Zenodo | |
# id: download_current_zenodo | |
# env: | |
# zenodo_token: ${{ secrets.ZENODO }} | |
# run: | | |
# # Set up vars from config file | |
# chmod +x datasources/chebi/config | |
# . datasources/chebi/config . | |
# echo "file name: $to_check_from_zenodo" | |
# # Request Zenodo API to download the file | |
# curl -H "Authorization: Bearer $zenodo_token" -LJO https://zenodo.org/api/record/$zenodo_file_id/$to_check_from_zenodo | |
# Run the Java SDF preprocessing on the downloaded file and record the
# outcome in $GITHUB_ENV as FAILED=false / FAILED=true.
- name: Test SDF processing
  id: sdf_process
  run: |
    inputFile="ChEBI_complete_3star.sdf"
    outputDir="datasources/chebi/recentData/"
    # NOTE(review): nothing in this script reads the `new` directory;
    # it is still created in case the Java program expects it - confirm.
    mkdir -p new "$outputDir"

    # Test the command directly in the `if`. Run steps use `bash -e` by
    # default, so with a separate `$?` check a failing `java` call would end
    # the script before the else-branch could record FAILED=true.
    if java -cp java/target/mapping_prerocessing-0.0.1-jar-with-dependencies.jar org.sec2pri.chebi_sdf "$inputFile" "$outputDir"; then
      # Java program succeeded
      echo "Successful preprocessing of ChEBI data."
      echo "FAILED=false" >> "$GITHUB_ENV"
    else
      # Java program failed
      echo "Failed preprocessing of ChEBI data."
      echo "FAILED=true" >> "$GITHUB_ENV"
    fi
# Validate the identifiers of the freshly processed mapping file against the
# ChEBI pattern from the BridgeDb datasources table, then diff it against the
# previously stored mapping file and export ADDED, REMOVED, COUNT and CHANGE
# to $GITHUB_ENV. Runs only when FAILED was recorded as 'false'.
- name: RegEx and Diff test
  if: ${{ env.FAILED == 'false' }}
  run: |
    # Load config; this script reads $to_check_from_zenodo from it
    chmod +x datasources/chebi/config
    . datasources/chebi/config .
    old="datasources/chebi/data/$to_check_from_zenodo"
    new="datasources/chebi/recentData/$to_check_from_zenodo"

    # remove headers (note: edits both files in place)
    sed -i '1d' "$new"
    sed -i '1d' "$old"

    # qc integrity of IDs
    wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
    CHEBI_ID=$(awk -F '\t' '$1 == "ChEBI" {print $10}' datasources.tsv)
    if [ -z "$CHEBI_ID" ]; then
      echo "Error: no ChEBI pattern found in datasources.tsv."
      echo "FAILED=true" >> "$GITHUB_ENV"
      exit 1
    fi

    # Split the file into two separate files for each column
    awk -F '\t' '{print $1}' "$new" > column1.txt
    awk -F '\t' '{print $2}' "$new" > column2.txt

    # Fail the step if any line of file $2 does not match the ID pattern.
    # `grep -v -q` exits 0 when at least one NON-matching line exists, so
    # that exit status is the error case.
    # NOTE(review): -P is used because the pattern presumably contains
    # Perl/Java-style classes such as \d, which -E does not interpret -
    # confirm against the datasources.tsv contents.
    check_column() {
      local label="$1" file="$2"
      if grep -qvP -e "$CHEBI_ID" "$file"; then
        echo "Error: At least one line in the $label column does not match pattern."
        # Show the offending lines with their line numbers
        grep -nvP -e "$CHEBI_ID" "$file"
        echo "FAILED=true" >> "$GITHUB_ENV"
        exit 1
      fi
      echo "All lines in the $label column match the pattern."
    }
    check_column primary column1.txt
    check_column secondary column2.txt

    # sort them
    sort "$old" | tr -d "\r" > ids_old.txt
    sort "$new" | tr -d "\r" > ids_new.txt

    echo "Performing diff between the sorted lists of IDs"
    # Perform a diff between the sorted lists of IDs
    output_file=diff.txt
    diff -u ids_old.txt ids_new.txt > "$output_file" || true

    # Retrieve new / removed lines. The '+++ ' and '--- ' file headers of
    # the unified diff are skipped and only the leading marker is stripped,
    # so both lists hold bare lines that can be compared with each other.
    added=$(grep '^+' "$output_file" | grep -v '^+++ ' | sed 's/^+//') || true
    removed=$(grep '^-' "$output_file" | grep -v '^--- ' | sed 's/^-//') || true

    # Drop lines that show up as both added and removed
    added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
    removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
    added=$added_filtered
    removed=$removed_filtered

    # Count lines of a string. Command substitution strips the trailing
    # newline, so it is added back before `wc -l`; an empty string counts
    # as 0. The value is passed as data, never as a printf format string.
    count_lines() {
      if [ -z "$1" ]; then
        echo 0
      else
        printf '%s\n' "$1" | wc -l
      fi
    }
    count_removed=$(count_lines "$removed")
    count_added=$(count_lines "$added")

    # Placeholder text for the report when a list is empty
    if [ -z "$removed" ]; then
      removed="None"
    fi
    if [ -z "$added" ]; then
      added="None"
    fi

    echo ________________________________________________
    echo " removed pairs "
    echo ________________________________________________
    echo "$removed"
    echo ________________________________________________
    echo " added pairs "
    echo ________________________________________________
    echo "$added"
    echo _________________________________________________
    echo "What's changed:"
    echo "- Added id pairs: $count_added"
    echo "- Removed id pairs: $count_removed"

    # Store to env to use in issue
    echo "ADDED=$count_added" >> "$GITHUB_ENV"
    echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
    count=$((count_added + count_removed))
    echo "COUNT=$count" >> "$GITHUB_ENV"

    # Percentage of changed pairs relative to the old file (header already
    # removed above). Guard against an empty old file to avoid dividing by 0.
    total_old=$(wc -l < "$old")
    if [ "$total_old" -gt 0 ]; then
      change=$((100 * count / total_old))
    elif [ "$count" -gt 0 ]; then
      change=100
    else
      change=0
    fi
    echo "CHANGE=$change" >> "$GITHUB_ENV"
# Publish everything under datasources/chebi/recentData/ as the
# `chebi_processed` artifact of this run.
- name: Upload processed data as artifacts
  uses: actions/upload-artifact@v4
  with:
    path: datasources/chebi/recentData/*
    name: chebi_processed
# File an issue from the ISSUE_UPDATE.md template when COUNT is non-zero
- name: Post issue about update availability
  if: ${{ env.COUNT != 0 }}
  uses: JasonEtco/create-an-issue@v2
  env:
    SOURCE: ChEBI
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  with:
    update_existing: true
    filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
# File an issue from the ISSUE_FAIL.md template when a step recorded
# FAILED=true.
- name: Post issue about failing test
  # always() is required: without a status check function the condition gets
  # an implicit success(), so this step would be skipped whenever an earlier
  # step wrote FAILED=true and then exited with a non-zero status.
  if: ${{ always() && env.FAILED == 'true' }}
  uses: JasonEtco/create-an-issue@v2
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    SOURCE: chebi
  with:
    filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
    update_existing: true