# Skip to content

# Check and test UniProt updates #36

# Check and test UniProt updates

# Check and test UniProt updates #36

# Workflow file for this run

name: Check and test UniProt updates

# Triggers: a twice-monthly schedule, a manual run, and any pull request
# that modifies this workflow file.
on:
  schedule:
    # At 00:00 on the 1st and the 15th day of every month
    - cron: '0 0 1,15 * *'
  pull_request:
    # Lets a PR that edits the workflow prove that it still works
    paths:
      - ".github/workflows/uniprot.yml"
  workflow_dispatch:

# Scopes granted to the GITHUB_TOKEN for every job in this workflow
permissions:
  contents: write
  pages: write
  id-token: write
  issues: write
jobs:
  # Job 1: read the release date stored in the repository config and the
  # date of the latest UniProt release, and publish both as job outputs.
  check_new_data:
    runs-on: ubuntu-latest
    name: Check the date of the latest data
    outputs:
      # NOTE(review): the check_download step only writes DATE_NEW and
      # DATE_OLD; the three *_NEW file-name outputs below are never set by it
      # and therefore evaluate to empty strings. Kept for compatibility.
      UNIPROT_SPROT_NEW: ${{ steps.check_download.outputs.UNIPROT_SPROT_NEW }}
      SEC_AC_NEW: ${{ steps.check_download.outputs.SEC_AC_NEW }}
      DELAC_SP_NEW: ${{ steps.check_download.outputs.DELAC_SP_NEW }}
      DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
      DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
    steps:
      # step 1: check the release date for the latest UniProt files
      - name: Checkout
        uses: actions/checkout@v4
      - name: Check for new uniprot files
        id: check_download
        run: |
          ## Extract the date from the uniprot config file
          date_old=$(grep -E '^date=' datasources/uniprot/config | cut -d'=' -f2)
          echo 'Accessing the uniprot data'
          wget https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/ -O uniprot_index.html
          # Print the directory listing to the log (cat, not a pager: the job is non-interactive)
          cat uniprot_index.html
          # The date is taken from the table cell that follows the uniprot_sprot.fasta.gz link
          date_new=$(grep -oP 'uniprot_sprot\.fasta\.gz</a></td><td[^>]*>\K[0-9]{4}-[0-9]{2}-[0-9]{2}' uniprot_index.html)
          # Store dates to output
          echo "DATE_OLD=$date_old" >> "$GITHUB_OUTPUT"
          echo "DATE_NEW=$date_new" >> "$GITHUB_OUTPUT"
          echo "Date of latest release: $date_new", "Date of release of the current version: $date_old"

  # Job 2: download the current UniProt files, run the R preprocessing on
  # them, diff the result against the stored mapping and open issues.
  test_new_data_processing:
    name: Processing new data and check updates
    needs: check_new_data
    env:
      DATE_OLD: ${{ needs.check_new_data.outputs.DATE_OLD }}
      DATE_NEW: ${{ needs.check_new_data.outputs.DATE_NEW }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      # step 2: download the recent data
      - name: Download the recent data
        run: |
          ## Store outputs from previous job in environment variables.
          ## The variable NAME must be the literal DATE_NEW; the previous
          ## "$DATE_NEW=$DATE_NEW" used the date value as the name.
          echo "DATE_NEW=$DATE_NEW" >> "$GITHUB_ENV"
          ## Define files to be downloaded
          UNIPROT_SPROT_NEW=uniprot_sprot.fasta.gz
          SEC_AC_NEW=sec_ac.txt
          DELAC_SP_NEW=delac_sp.txt
          ## Create temp. folder to store the data in
          mkdir -p datasources/uniprot/data
          ## Download uniprot file
          wget "https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/${UNIPROT_SPROT_NEW}"
          wget "https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/docs/${SEC_AC_NEW}"
          wget "https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/docs/${DELAC_SP_NEW}"
          mv "$DELAC_SP_NEW" "$SEC_AC_NEW" "$UNIPROT_SPROT_NEW" datasources/uniprot/data
          ## Check file size if available
          ls -trlh datasources/uniprot/data
      # step 3: run the Rscripts for uniprot preprocessing
      - name: Install R
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: '4.1' # Specify the R version
          include-recommended: true
      - name: Test uniprot data processing
        id: uniprot_process
        run: |
          sourceVersion=$DATE_NEW
          uniprot_sprot=datasources/uniprot/data/uniprot_sprot.fasta.gz
          sec_ac=datasources/uniprot/data/sec_ac.txt
          delac_sp=datasources/uniprot/data/delac_sp.txt
          # Run the R script directly as the `if` condition. The step shell
          # runs with `bash -e`, so a bare failing command would abort the
          # step before a later `$?` check could record FAILED=true.
          if Rscript r/src/uniprot.R "$sourceVersion" "$uniprot_sprot" "$delac_sp" "$sec_ac"; then
            # script succeeded
            echo "Successful preprocessing of uniprot data."
            echo "FAILED=false" >> "$GITHUB_ENV"
          else
            # script failed
            echo "Failed preprocessing of uniprot data."
            echo "FAILED=true" >> "$GITHUB_ENV"
          fi
      # step 4: compare the new and old data
      - name: Diff versions
        if: ${{ env.FAILED == 'false' }}
        run: |
          # Set up vars from config file
          to_check_from_zenodo=$(grep -E '^to_check_from_zenodo=' datasources/uniprot/config | cut -d'=' -f2)
          old="datasources/uniprot/data/$to_check_from_zenodo"
          new="datasources/uniprot/recentData/$to_check_from_zenodo"
          # Unzip the zip file
          unzip datasources/uniprot/data/UniProt_secID2priID.zip -d datasources/uniprot/data/
          # Extract the first two tab-separated columns (ID pair) from both files and sort them
          sort "$old" | tr -d "\r" | cut -f 1,2 > ids_old.txt
          sort "$new" | tr -d "\r" | cut -f 1,2 > ids_new.txt
          # TODO decide whether to perform diff on symbols too?
          # Extract the secondarySymbol column from both files and sort them
          ##cat "$old" | sort | tr -d "\r" | cut -f 4 > symbols_old.txt
          ##cat "$new" | sort | tr -d "\r" | cut -f 4 > symbols_new.txt
          echo "Performing diff between the sorted lists of IDs"
          # Perform a diff between the sorted lists of IDs
          output_file=diff.txt
          diff -u ids_old.txt ids_new.txt > "$output_file" || true
          head "$output_file"
          # The first two lines of a unified diff are the '---'/'+++' file
          # headers; skip them so they are not mistaken for changed rows.
          # Only the leading marker is stripped, so '+'/'-' characters that
          # belong to the data itself are preserved.
          # retrieve new lines
          echo Counting added lines...
          added=$(tail -n +3 "$output_file" | grep '^+' | sed 's/^+//') || true
          # retrieve removed lines
          echo Counting removed lines...
          removed=$(tail -n +3 "$output_file" | grep '^-' | sed 's/^-//') || true
          # Drop rows that appear on both sides (moved, not changed)
          added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
          removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
          added=$added_filtered
          removed=$removed_filtered
          # count them
          echo Counting differences:
          # Empty lists count as 0; otherwise count lines with an explicit
          # trailing newline so the last row is included, and never use the
          # data as a printf format string.
          if [ -z "$removed" ]; then
            count_removed=0
            removed="None"
          else
            count_removed=$(printf '%s\n' "$removed" | wc -l)
          fi
          if [ -z "$added" ]; then
            count_added=0
            added="None"
          else
            count_added=$(printf '%s\n' "$added" | wc -l)
          fi
          echo ________________________________________________
          echo " removed pairs "
          echo ________________________________________________
          echo "$removed"
          echo ________________________________________________
          echo " added pairs "
          echo ________________________________________________
          echo "$added"
          echo _________________________________________________
          echo "What's changed:"
          echo "- Added id pairs: $count_added"
          echo "- Removed id pairs: $count_removed"
          # Store to env to use in issue
          echo "ADDED=$count_added" >> "$GITHUB_ENV"
          echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
          count=$((count_added + count_removed))
          echo "COUNT=$count" >> "$GITHUB_ENV"
          total_old=$(wc -l < "$old") || total_old=0
          # Integer percentage of changed pairs relative to the old file;
          # guard against an empty/missing old file to avoid division by zero.
          if [ "$total_old" -gt 0 ]; then
            change=$((100 * count / total_old))
          else
            echo "::warning::Old mapping file is empty or missing; reporting CHANGE=0"
            change=0
          fi
          echo "CHANGE=$change" >> "$GITHUB_ENV"
      # upload-artifact v3 is deprecated and no longer runs on github.com; v4
      # accepts the same name/path inputs.
      - name: 'Upload processed data as artifacts'
        uses: actions/upload-artifact@v4
        with:
          name: uniprot_processed
          path: datasources/uniprot/recentData/*
      # step 5: post issues
      # COUNT is only exported by the diff step; when that step is skipped the
      # empty value compares equal to 0 and no update issue is posted.
      - uses: JasonEtco/create-an-issue@v2
        if: env.COUNT != 0
        name: Post issue about update availability
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # presumably read by the issue template — verify against ISSUE_UPDATE.md
          SOURCE: "UniProt"
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
          update_existing: true
      - uses: JasonEtco/create-an-issue@v2
        name: Post issue about failing test
        if: ${{ env.FAILED == 'true' }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # presumably read by the issue template — verify against ISSUE_FAIL.md
          SOURCE: "UniProt"
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
          update_existing: true