# Check and test HGNC updates #62
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears. To review, open
# the file in an editor that reveals hidden Unicode characters.
# Workflow header: display name, triggers, and GITHUB_TOKEN permissions.
name: Check and test HGNC updates

on:
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
  pull_request:  # tests whether it is working on PR
    paths:
      - '.github/workflows/hgnc.yml'
  schedule:
    - cron: "0 0 1,15 * *"  # Run the workflow on the 1st and 15th day of each month

# NOTE(review): the jobs visible in this workflow only check out the repository
# and create issues; `contents: write`, `pages: write` and `id-token: write`
# look broader than needed - confirm before tightening.
permissions:
  contents: write
  pages: write
  id-token: write
  issues: write
jobs:
  # Job 1: read the currently configured release date and look up the newest
  # HGNC quarterly release on the EBI FTP index. File names and dates are
  # published as job outputs for the next job.
  check_new_data:
    runs-on: ubuntu-latest
    name: Check the date of the latest data
    outputs:
      COMPLETE_NEW: ${{ steps.check_download.outputs.COMPLETE_NEW }}
      WITHDRAWN_NEW: ${{ steps.check_download.outputs.WITHDRAWN_NEW }}
      DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
      DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
    steps:
      # step 1: check the release date for the latest HGNC files
      - name: Checkout
        uses: actions/checkout@v4
      - name: Check for new hgnc files
        id: check_download
        run: |
          ## Extract the date from the hgnc config file
          date_old=$(grep -E '^date=' datasources/hgnc/config | cut -d'=' -f2)
          echo 'Accessing the hgnc data'
          wget https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/quarterly/tsv/ -O hgnc_index.html
          # Extracting the latest complete file (last match in the index listing)
          complete=$(grep -o 'hgnc_complete_set_[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}\.txt' hgnc_index.html | tail -n 1)
          # Extracting the latest withdrawn file (last match in the index listing)
          withdrawn=$(grep -o 'withdrawn_[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}\.txt' hgnc_index.html | tail -n 1)
          # The date is the 4th "_"-separated field of the complete-set file name
          date_new=$(echo "$complete" | awk -F '_' '{print $4}' | sed 's/\.txt//')
          # Store dates to output
          echo "DATE_OLD=$date_old" >> "$GITHUB_OUTPUT"
          echo "DATE_NEW=$date_new" >> "$GITHUB_OUTPUT"
          echo "COMPLETE_NEW=$complete" >> "$GITHUB_OUTPUT"
          echo "WITHDRAWN_NEW=$withdrawn" >> "$GITHUB_OUTPUT"
          echo "Date of latest release: $date_new", "Date of release of the current version: $date_old"

  # Job 2: download the release found by job 1, run the R preprocessing on it,
  # diff the resulting ID pairs against the stored version, and open an issue
  # about either the available update or the failed preprocessing.
  test_new_data_processing:
    name: Processing new data and check updates
    needs: check_new_data
    # Outputs of the previous job, available to every step as environment variables.
    env:
      DATE_OLD: ${{ needs.check_new_data.outputs.DATE_OLD }}
      DATE_NEW: ${{ needs.check_new_data.outputs.DATE_NEW }}
      COMPLETE_NEW: ${{ needs.check_new_data.outputs.COMPLETE_NEW }}
      WITHDRAWN_NEW: ${{ needs.check_new_data.outputs.WITHDRAWN_NEW }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      # step 2: download the recent data
      - name: Download the recent data
        run: |
          ## DATE_NEW, COMPLETE_NEW and WITHDRAWN_NEW come from the job-level
          ## `env`, so they do not need to be re-exported through GITHUB_ENV.
          ## Create temp. folder to store the data in
          mkdir -p datasources/hgnc/data
          ## Download hgnc file
          wget "https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/quarterly/tsv/${WITHDRAWN_NEW}"
          wget "https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/quarterly/tsv/${COMPLETE_NEW}"
          mv "$WITHDRAWN_NEW" "$COMPLETE_NEW" datasources/hgnc/data
          ## Check file size if available
          ls -trlh datasources/hgnc/data
      # step 3: run the Rscripts for hgnc preprocessing
      - name: Install R
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: '4.1'  # Specify the R version
          include-recommended: true
      - name: Test hgnc data processing
        id: hgnc_process
        run: |
          sourceVersion=$DATE_NEW
          complete="datasources/hgnc/data/${COMPLETE_NEW}"
          withdrawn="datasources/hgnc/data/${WITHDRAWN_NEW}"
          # Run the R script inside the `if` condition: the default shell runs
          # with `-e`, so a bare failing command followed by a `$?` check would
          # abort the step before FAILED=true could be recorded.
          if Rscript r/src/hgnc.R "$sourceVersion" "$withdrawn" "$complete"; then
            # script succeeded
            echo "Successful preprocessing of hgnc data."
            echo "FAILED=false" >> "$GITHUB_ENV"
          else
            # script failed
            echo "Failed preprocessing of hgnc data."
            echo "FAILED=true" >> "$GITHUB_ENV"
          fi
      # step 4: compare the new and old data
      - name: Diff versions
        if: ${{ env.FAILED == 'false' }}
        run: |
          # Set up vars from config file
          to_check_from_zenodo=$(grep -E '^to_check_from_zenodo=' datasources/hgnc/config | cut -d'=' -f2)
          old="datasources/hgnc/data/$to_check_from_zenodo"
          new="datasources/hgnc/recentData/$to_check_from_zenodo"
          # Extract the primaryID and secondaryID column from both files and sort them
          cat "$old" | sort | tr -d "\r" | cut -f 1,3 > ids_old.txt
          cat "$new" | sort | tr -d "\r" | cut -f 1,3 > ids_new.txt
          # TODO decide whether to perform diff on symbols too?
          # Extract the secondarySymbol column from both files and sort them
          ##cat "$old" | sort | tr -d "\r" | cut -f 4 > symbols_old.txt
          ##cat "$new" | sort | tr -d "\r" | cut -f 4 > symbols_new.txt
          echo "Performing diff between the sorted lists of IDs"
          # Perform a diff between the sorted lists of IDs
          # (diff exits non-zero when the files differ, hence `|| true`)
          output_file=diff.txt
          diff -u ids_old.txt ids_new.txt > "$output_file" || true
          head "$output_file"
          # The first two lines of a unified diff are the "---"/"+++" file
          # headers; skip them so they are not reported as changed pairs, and
          # strip only the leading +/- marker so the data itself stays intact.
          # retrieve new lines
          added=$(tail -n +3 "$output_file" | grep '^+' | sed 's/^+//') || true
          # retrieve removed lines
          removed=$(tail -n +3 "$output_file" | grep '^-' | sed 's/^-//') || true
          # Drop lines that appear on both sides (moved rather than changed)
          added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
          removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
          added=$added_filtered
          removed=$removed_filtered
          # count non-empty lines (grep -c prints 0 and exits 1 when there are none)
          count_removed=$(grep -c . <<< "$removed") || true
          count_added=$(grep -c . <<< "$added") || true
          # show a placeholder instead of an empty list
          if [ -z "$removed" ]; then
            count_removed=0
            removed="None"
          fi
          if [ -z "$added" ]; then
            count_added=0
            added="None"
          fi
          echo ________________________________________________
          echo " removed pairs "
          echo ________________________________________________
          echo "$removed"
          echo ________________________________________________
          echo " added pairs "
          echo ________________________________________________
          echo "$added"
          echo _________________________________________________
          echo "What's changed:"
          echo "- Added id pairs: $count_added"
          echo "- Removed id pairs: $count_removed"
          # Store to env to use in issue
          echo "ADDED=$count_added" >> "$GITHUB_ENV"
          echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
          count=$((count_added + count_removed))
          echo "COUNT=$count" >> "$GITHUB_ENV"
          total_old=$(cat "$old" | wc -l) || true
          # Percentage of changed pairs relative to the old file; guard against
          # an empty or missing old file to avoid a division by zero.
          if [ "$total_old" -gt 0 ]; then
            change=$((100 * count / total_old))
          else
            echo "Warning: $old is empty or missing; reporting CHANGE=0"
            change=0
          fi
          echo "CHANGE=$change" >> "$GITHUB_ENV"
      # upload-artifact v3 is deprecated and no longer runs on github.com; use v4.
      - name: 'Upload processed data as artifacts'
        uses: actions/upload-artifact@v4
        with:
          name: hgnc_processed
          path: datasources/hgnc/recentData/*
      - uses: JasonEtco/create-an-issue@v2
        if: env.COUNT != 0
        name: Post issue about update availability
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: "HGNC"
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
          update_existing: true
      - uses: JasonEtco/create-an-issue@v2
        name: Post issue about failing test
        if: ${{ env.FAILED == 'true' }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: "HGNC"
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
          update_existing: true
      # The preprocessing step records failure in FAILED instead of aborting,
      # so fail the job here (after the issue was posted) to keep the run red.
      - name: Fail the job when preprocessing failed
        if: ${{ env.FAILED == 'true' }}
        run: exit 1