# Check and test HMDB updates #36
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow that checks whether HMDB has published a newer metabolite dump than
# the one recorded in datasources/hmdb/config and, if processing succeeds,
# reports the differences through GitHub issues.
name: Check and test HMDB updates

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  pull_request:  # tests whether it is working on PR
    paths:
      - '.github/workflows/hmdb.yml'
  schedule:
    - cron: "0 0 1,15 * *"  # Run the workflow on the 1st and 15th day of each month
# Least-privilege token: the jobs only read the repository (checkout) and
# create/update issues. No step in this workflow pushes commits, deploys to
# Pages or requests an OIDC token.
permissions:
  contents: read
  issues: write
jobs:
  # Job 1: compare the date stored in datasources/hmdb/config with the
  # update_date found at the top of the latest HMDB metabolite dump.
  # Exposes DATE_OLD, DATE_NEW and NEW_RELEASE ('true'/'false') as outputs.
  check_new_data:
    runs-on: ubuntu-latest
    name: Check the date of the latest data
    outputs:
      DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
      DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
      NEW_RELEASE: ${{ steps.check_download.outputs.NEW_RELEASE }}
    steps:
      # step 1: check the release date for the latest hmdb files
      - name: Checkout
        uses: actions/checkout@v4
      - name: Check for new hmdb files
        id: check_download
        run: |
          ## Extract the date from the hmdb config file
          date_old=$(grep -E '^date=' datasources/hmdb/config | cut -d'=' -f2)
          echo 'Accessing the hmdb data'
          wget http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip
          unzip hmdb_metabolites.zip
          head hmdb_metabolites.xml
          ## Take the first update_date (YYYY-MM-DD) found in the head of the dump
          date_new=$(head hmdb_metabolites.xml | grep -m 1 'update_date' | sed 's/.*>\([0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}\).*/\1/')
          ## An empty date would make `date -d ""` silently resolve to today,
          ## so stop here instead of comparing meaningless timestamps
          if [ -z "$date_old" ] || [ -z "$date_new" ]; then
            echo "Error: could not determine the release dates (current: '$date_old', latest: '$date_new')."
            exit 1
          fi
          ## Store dates to output
          echo "DATE_OLD=$date_old" >> "$GITHUB_OUTPUT"
          echo "DATE_NEW=$date_new" >> "$GITHUB_OUTPUT"
          ## Compare the dates and set variable if date_new is more recent
          timestamp_new=$(date -d "$date_new" +%s)
          timestamp_old=$(date -d "$date_old" +%s)
          if [ "$timestamp_new" -gt "$timestamp_old" ]; then
            echo "New release available: $date_new"
            echo "NEW_RELEASE=true" >> "$GITHUB_OUTPUT"
          else
            echo 'No new release available'
            echo "NEW_RELEASE=false" >> "$GITHUB_OUTPUT"
          fi
          echo "Date of latest release: $date_new"
          echo "Date of release of the current version: $date_old"

  # Job 2: download and split the HMDB dump, run the Java preprocessing,
  # QC the produced identifiers, diff them against the stored mapping and
  # report the outcome through issues.
  test_new_data_processing:
    name: Processing new data and check updates
    needs: check_new_data
    env:
      DATE_OLD: ${{ needs.check_new_data.outputs.DATE_OLD }}
      DATE_NEW: ${{ needs.check_new_data.outputs.DATE_NEW }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      # step 2: download the recent data
      - name: Download the recent data
        run: |
          ## xml_split is provided by xml-twig-tools; -y keeps apt non-interactive
          sudo apt-get update
          sudo apt-get install -y xml-twig-tools
          ## Store outputs from previous job in environment variables
          echo "DATE_NEW=$DATE_NEW" >> "$GITHUB_ENV"
          ## Download hmdb file
          wget http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip
          unzip hmdb_metabolites.zip
          mkdir hmdb
          mv hmdb_metabolites.xml hmdb
          cd hmdb
          ## Split the dump into one file per first-level element
          xml_split -v -l 1 hmdb_metabolites.xml
          rm hmdb_metabolites.xml
          cd ../
          zip -r hmdb_metabolites_split.zip hmdb
      # step 3: run the jar for hmdb preprocessing
      - name: Set up Java
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'
      - name: Test XML processing
        id: sdf_process
        run: |
          inputFile=hmdb_metabolites_split.zip
          outputDir="datasources/hmdb/recentData/"
          mkdir -p "$outputDir"
          # Run the Java program inside the `if` condition: the default shell
          # runs with `-e`, so a bare failing command would abort the step
          # before FAILED could be recorded for the later steps.
          if java -cp java/target/mapping_prerocessing-0.0.1-jar-with-dependencies.jar org.sec2pri.hmdb_xml "$inputFile" "$outputDir"; then
            # Java program succeeded
            echo "Successful preprocessing of HMDB data."
            echo "FAILED=false" >> "$GITHUB_ENV"
          else
            # Java program failed
            echo "Failed preprocessing of HMDB data."
            echo "FAILED=true" >> "$GITHUB_ENV"
          fi
      # step 4: compare the new and old data
      - name: Diff versions
        if: ${{ env.FAILED == 'false' }}
        run: |
          # Set up vars from config file
          to_check_from_zenodo=$(grep -E '^to_check_from_zenodo=' datasources/hmdb/config | cut -d'=' -f2)
          old="datasources/hmdb/data/$to_check_from_zenodo"
          new="datasources/hmdb/recentData/$to_check_from_zenodo"
          # remove headers
          sed -i '1d' "$new"
          sed -i '1d' "$old"
          # qc integrity of IDs: the pattern is read from column 10 of the
          # HMDB row in the BridgeDb datasources table
          wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
          HMDB_ID=$(awk -F '\t' '$1 == "HMDB" {print $10}' datasources.tsv)
          if [ -z "$HMDB_ID" ]; then
            # An empty pattern matches every line and would make the QC a no-op
            echo "Error: no HMDB identifier pattern found in datasources.tsv."
            echo "FAILED=true" >> "$GITHUB_ENV"
            exit 1
          fi
          # Split the file into two separate files for each column
          awk -F '\t' '{print $1}' "$new" | tr -d '\r' > column1.txt
          awk -F '\t' '{print $2}' "$new" | tr -d '\r' > column2.txt
          # check_column LABEL FILE
          # Lists (with line numbers) every line of FILE that does not match
          # the pattern. Returns 0 when all lines match, 1 otherwise.
          # `grep -v -q` exits 0 as soon as one NON-matching line exists.
          # -P is used so Perl-style constructs such as \d are understood
          # (NOTE(review): confirm the pattern syntax used in datasources.tsv).
          check_column() {
            label=$1
            file=$2
            if grep -qvP -- "$HMDB_ID" "$file"; then
              echo "Error: At least one line in the $label column does not match pattern."
              grep -nvP -- "$HMDB_ID" "$file"
              return 1
            fi
            echo "All lines in the $label column match the pattern."
          }
          # Check both columns before failing so that every problem is reported
          qc_ok=true
          check_column primary column1.txt || qc_ok=false
          check_column secondary column2.txt || qc_ok=false
          if [ "$qc_ok" != true ]; then
            echo "FAILED=true" >> "$GITHUB_ENV"
            exit 1
          fi
          # sort them (carriage returns are stripped first so that the sort
          # order seen by comm is the order of the final content)
          tr -d '\r' < "$old" | LC_ALL=C sort > ids_old.txt
          tr -d '\r' < "$new" | LC_ALL=C sort > ids_new.txt
          echo "Performing diff between the sorted lists of Symbols"
          # retrieve new lines (only present in the new list)
          added=$(LC_ALL=C comm -13 ids_old.txt ids_new.txt)
          # retrieve removed lines (only present in the old list)
          removed=$(LC_ALL=C comm -23 ids_old.txt ids_new.txt)
          # count them, making sure we are not counting empty lists
          if [ -n "$removed" ]; then
            count_removed=$(printf '%s\n' "$removed" | wc -l)
          else
            count_removed=0
            removed="None"
          fi
          if [ -n "$added" ]; then
            count_added=$(printf '%s\n' "$added" | wc -l)
          else
            count_added=0
            added="None"
          fi
          echo ________________________________________________
          echo " removed pairs "
          echo ________________________________________________
          echo "$removed"
          echo ________________________________________________
          echo " added pairs "
          echo ________________________________________________
          echo "$added"
          echo _________________________________________________
          echo "What's changed:"
          echo "- Added id pairs: $count_added"
          echo "- Removed id pairs: $count_removed"
          # Store to env to use in issue
          echo "ADDED=$count_added" >> "$GITHUB_ENV"
          echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
          count=$((count_added + count_removed))
          echo "COUNT=$count" >> "$GITHUB_ENV"
          # Percentage of changed pairs relative to the old list; an empty old
          # list has no meaningful percentage, so report 0 instead of dividing
          # by zero
          total_old=$(wc -l < "$old")
          if [ "$total_old" -gt 0 ]; then
            change=$((100 * count / total_old))
          else
            change=0
          fi
          echo "CHANGE=$change" >> "$GITHUB_ENV"
      # Only upload when preprocessing produced data
      - name: 'Upload processed data as artifacts'
        if: ${{ env.FAILED == 'false' }}
        uses: actions/upload-artifact@v4
        with:
          name: hmdb_processed
          path: datasources/hmdb/recentData/*
      # COUNT is only set by the diff step; when it is unset the comparison
      # evaluates to false and no update issue is posted
      - uses: JasonEtco/create-an-issue@v2
        if: env.COUNT != 0
        name: Post issue about update availability
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: "HMDB"
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
          update_existing: true
      # !cancelled() lifts the implicit success() check so this step still runs
      # after the diff step has failed with FAILED=true
      - uses: JasonEtco/create-an-issue@v2
        name: Post issue about failing test
        if: ${{ !cancelled() && env.FAILED == 'true' }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: hmdb
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
          update_existing: true
      # Keep the run red when preprocessing or QC failed, after the issue has
      # been posted
      - name: Fail the job when processing or QC failed
        if: ${{ !cancelled() && env.FAILED == 'true' }}
        run: exit 1