Check and test HMDB updates #42

Workflow file for this run

	# Workflow header: display name, triggers, and GITHUB_TOKEN permissions.
	name: Check and test HMDB updates

	on:
	  # Allow manual runs from the Actions tab.
	  workflow_dispatch:
	  # Run on pull requests that modify this workflow file, to test the workflow itself.
	  pull_request:
	    paths:
	      - '.github/workflows/hmdb.yml'
	  schedule:
	    # Run the workflow on the 1st and 15th day of each month
	    - cron: '0 0 1,15 * *'

	# Least privilege for the default GITHUB_TOKEN: the jobs in this workflow check
	# out the repository and create or update issues. The cross-repository dispatch
	# authenticates with its own secret rather than with GITHUB_TOKEN.
	permissions:
	  contents: read
	  issues: write


	jobs:
	  # Job 1: read the update date of the current HMDB download, compare it with
	  # the date stored in datasources/hmdb/config, and expose both dates plus a
	  # NEW_RELEASE flag as job outputs.
	  check_new_data:
	    runs-on: ubuntu-latest
	    name: Check the date of the latest data
	    outputs:
	      DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
	      DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
	      NEW_RELEASE: ${{ steps.check_download.outputs.NEW_RELEASE }}
	    steps:
	      # step 1: check the release date for the latest hmdb files
	      - name: Checkout
	        uses: actions/checkout@v4
	      - name: Check for new hmdb files
	        id: check_download
	        run: |
	          ## Extract the date from the hmdb config file
	          date_old=$(grep -E '^date=' datasources/hmdb/config | cut -d'=' -f2)
	          echo 'Accessing the hmdb data'
	          wget http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip
	          unzip hmdb_metabolites.zip
	          head hmdb_metabolites.xml
	          ## On the update_date line, keep only the YYYY-MM-DD date that follows the opening tag
	          date_new=$(head hmdb_metabolites.xml | grep 'update_date' | sed 's/.*>\([0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}\).*/\1/')
	          ## Stop early when no date was found: `date -d ""` would otherwise be
	          ## interpreted as today and always report a new release
	          if ! [[ "$date_new" =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]; then
	            echo "Could not read the update date from hmdb_metabolites.xml" >&2
	            exit 1
	          fi
	          ## Store dates to output
	          echo "DATE_OLD=$date_old" >> "$GITHUB_OUTPUT"
	          echo "DATE_NEW=$date_new" >> "$GITHUB_OUTPUT"
	          ## Compare the dates and set variable if date_new is more recent
	          timestamp1=$(date -d "$date_new" +%s)
	          timestamp2=$(date -d "$date_old" +%s)
	          if [ "$timestamp1" -gt "$timestamp2" ]; then
	            echo "New release available: $date_new"
	            echo "NEW_RELEASE=true" >> "$GITHUB_OUTPUT"
	          else
	            echo 'No new release available'
	            echo "NEW_RELEASE=false" >> "$GITHUB_OUTPUT"
	          fi
	          echo "Date of latest release: $date_new"
	          echo "Date of release of the current version: $date_old"

	  # Job 2: download and split the HMDB XML, run the Java preprocessing, diff the
	  # result against the stored mapping file, upload the processed data, and
	  # report the outcome through issues and a repository dispatch event.
	  test_new_data_processing:
	    name: Processing new data and check updates
	    needs: check_new_data
	    outputs:
	      ARTIFACT_ID: ${{ steps.artifact-upload.outputs.artifact-id }}
	    env:
	      DATE_OLD: ${{ needs.check_new_data.outputs.DATE_OLD }}
	      DATE_NEW: ${{ needs.check_new_data.outputs.DATE_NEW }}
	    runs-on: ubuntu-latest
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      # step 2: download the recent data
	      - name: Download the recent data
	        run: |
	          ## xml_split is provided by xml-twig-tools
	          sudo apt-get update
	          sudo apt-get install -y xml-twig-tools
	          ## Store outputs from previous job in environment variables
	          echo "DATE_NEW=$DATE_NEW" >> "$GITHUB_ENV"
	          ## Download hmdb file
	          wget http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip
	          unzip hmdb_metabolites.zip
	          mkdir hmdb
	          mv hmdb_metabolites.xml hmdb
	          cd hmdb
	          xml_split -v -l 1 hmdb_metabolites.xml
	          rm hmdb_metabolites.xml
	          cd ../
	          zip -r hmdb_metabolites_split.zip hmdb

	      # step 3: run the jar for hmdb preprocessing
	      - name: Set up Java
	        uses: actions/setup-java@v4
	        with:
	          java-version: '11'
	          distribution: 'temurin'

	      - name: Test XML processing
	        id: sdf_process
	        run: |
	          inputFile=hmdb_metabolites_split.zip
	          outputDir="datasources/hmdb/recentData/"
	          mkdir -p "$outputDir"
	          # Run the Java program inside the `if`: the default shell stops at the
	          # first failing command, so testing $? afterwards would never reach
	          # the failure branch.
	          if java -cp java/target/mapping_prerocessing-0.0.1-jar-with-dependencies.jar org.sec2pri.hmdb_xml "$inputFile" "$outputDir"; then
	            # Java program succeeded
	            echo "Successful preprocessing of HMDB data."
	            echo "FAILED=false" >> "$GITHUB_ENV"
	          else
	            # Java program failed
	            echo "Failed preprocessing of HMDB data."
	            echo "FAILED=true" >> "$GITHUB_ENV"
	          fi

	      # step 4: compare the new and old data
	      - name: Diff versions
	        if: ${{ env.FAILED == 'false' }}
	        run: |
	          # Set up vars from config file
	          to_check_from_zenodo=$(grep -E '^to_check_from_zenodo=' datasources/hmdb/config | cut -d'=' -f2)
	          old="datasources/hmdb/data/$to_check_from_zenodo"
	          new="datasources/hmdb/recentData/$to_check_from_zenodo"
	          # remove headers
	          sed -i '1d' "$new"
	          sed -i '1d' "$old"

	          # qc integrity of IDs: column 10 of the HMDB row in datasources.tsv is used as the ID pattern
	          wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
	          HMDB_ID=$(awk -F '\t' '$1 == "HMDB" {print $10}' datasources.tsv)
	          # An empty pattern matches every line and would make the check meaningless
	          if [ -z "$HMDB_ID" ]; then
	            echo "Error: no HMDB identifier pattern found in datasources.tsv."
	            echo "FAILED=true" >> "$GITHUB_ENV"
	            exit 1
	          fi
	          # Split the file into two separate files for each column
	          awk -F '\t' '{print $1}' "$new" > column1.txt
	          awk -F '\t' '{print $2}' "$new" > column2.txt

	          # `grep -v` selects the lines that do NOT match the pattern, so any
	          # output means the column contains an invalid ID. -P is used so that
	          # escapes such as \d in the pattern are honoured.
	          # NOTE(review): assumes the registry pattern is PCRE-compatible and
	          # covers both primary and secondary HMDB IDs - confirm.
	          mismatches=$(grep -nvP -- "$HMDB_ID" column1.txt | head -n 20) || true
	          if [ -n "$mismatches" ]; then
	            echo "Error: At least one line in the primary column does not match pattern."
	            echo "$mismatches"
	            echo "FAILED=true" >> "$GITHUB_ENV"
	            exit 1
	          fi
	          echo "All lines in the primary column match the pattern."

	          mismatches=$(grep -nvP -- "$HMDB_ID" column2.txt | head -n 20) || true
	          if [ -n "$mismatches" ]; then
	            echo "Error: At least one line in the secondary column does not match pattern."
	            echo "$mismatches"
	            echo "FAILED=true" >> "$GITHUB_ENV"
	            exit 1
	          fi
	          echo "All lines in the secondary column match the pattern."

	          # Strip carriage returns before sorting so they cannot influence the order
	          tr -d '\r' < "$old" | sort > ids_old.txt
	          tr -d '\r' < "$new" | sort > ids_new.txt

	          echo "Performing diff between the sorted lists of Symbols"

	          # Perform a diff between the sorted lists of Symbols
	          output_file=diff.txt
	          # diff exits non-zero when the files differ; that is not an error here
	          diff -u ids_old.txt ids_new.txt > "$output_file" || true

	          # retrieve new lines: skip the "+++" file header, strip only the leading marker
	          sed -n -e '/^+++ /d' -e 's/^+//p' "$output_file" > added_all.txt
	          # retrieve removed lines: skip the "---" file header, strip only the leading marker
	          sed -n -e '/^--- /d' -e 's/^-//p' "$output_file" > removed_all.txt

	          # Added lines marked 'Entry Withdrawn' are reported as removals
	          grep 'Entry Withdrawn' added_all.txt >> removed_all.txt || true
	          grep -v 'Entry Withdrawn' added_all.txt > added_kept.txt || true

	          # Drop lines that appear on both sides
	          added=$(comm -23 <(sort added_kept.txt) <(sort removed_all.txt))
	          removed=$(comm -23 <(sort removed_all.txt) <(sort added_kept.txt))
	          rm -f added_all.txt added_kept.txt removed_all.txt

	          # count them, making sure we are not counting empty lines
	          if [ -z "$removed" ]; then
	            count_removed=0
	            removed="None"
	          else
	            count_removed=$(wc -l <<< "$removed")
	          fi
	          if [ -z "$added" ]; then
	            count_added=0
	            added="None"
	          else
	            count_added=$(wc -l <<< "$added")
	          fi

	          echo ________________________________________________
	          echo " removed pairs "
	          echo ________________________________________________
	          echo "$removed"
	          echo ________________________________________________
	          echo " added pairs "
	          echo ________________________________________________
	          echo "$added"
	          echo _________________________________________________
	          echo "What's changed:"
	          echo "- Added id pairs: $count_added"
	          echo "- Removed id pairs: $count_removed"

	          # Store to env to use in issue
	          echo "ADDED=$count_added" >> "$GITHUB_ENV"
	          echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
	          count=$((count_added + count_removed))
	          echo "COUNT=$count" >> "$GITHUB_ENV"
	          # Percentage of changed pairs relative to the old file; guard against
	          # dividing by zero when the old file has no data rows
	          total_old=$(wc -l < "$old")
	          if [ "$total_old" -gt 0 ]; then
	            change=$((100 * count / total_old))
	          elif [ "$count" -gt 0 ]; then
	            change=100
	          else
	            change=0
	          fi
	          echo "CHANGE=$change" >> "$GITHUB_ENV"

	      - name: 'Upload processed data as artifacts'
	        id: artifact-upload
	        uses: actions/upload-artifact@v4
	        with:
	          name: hmdb_processed
	          path: datasources/hmdb/recentData/*

	      # COUNT is only set by the diff step; when it is unset the comparison is false
	      - uses: JasonEtco/create-an-issue@v2
	        if: ${{ env.COUNT != 0 }}
	        name: Post issue about update availability
	        env:
	          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	          SOURCE: "HMDB"
	        with:
	          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
	          update_existing: true

	      # !cancelled() lets this step run after an earlier step has failed, which
	      # is exactly when FAILED is 'true' because the diff step exited non-zero
	      - uses: JasonEtco/create-an-issue@v2
	        name: Post issue about failing test
	        if: ${{ !cancelled() && env.FAILED == 'true' }}
	        env:
	          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	          SOURCE: hmdb
	        with:
	          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
	          update_existing: true

	      # NOTE(review): RELEASE_NUMBER is not set anywhere in this workflow, so
	      # "version" is sent as an empty string - confirm what the receiver expects.
	      - name: Trigger docker release
	        uses: peter-evans/repository-dispatch@v3
	        if: ${{ env.COUNT != 0 }}
	        with:
	          token: ${{ secrets.PING }}
	          repository: sec2pri/omicsFixID
	          event-type: update-event
	          client-payload: >-
	            {
	            "ref": "${{ github.ref }}",
	            "sha": "${{ github.sha }}",
	            "datasource": "HMDB",
	            "version": "${{ env.RELEASE_NUMBER }}",
	            "processed_data": "https://github.com/sec2pri/mapping_preprocessing/actions/runs/${{ github.run_id }}/artifacts/${{ steps.artifact-upload.outputs.artifact-id }}",
	            "date": "${{ env.DATE_NEW }}"
	            }

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Check and test HMDB updates #42

Workflow file

Check and test HMDB updates #42

Jobs

Run details

Workflow file for this run