Check and test UniProt updates #29
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow: detect a new UniProt release, test the preprocessing on it and
# report the outcome (issue + repository dispatch).
name: Check and test UniProt updates

on:
  # Run the workflow on the 1st and 15th day of each month (00:00 UTC).
  schedule:
    - cron: "0 0 1,15 * *"
  # Tests whether the workflow is working when a PR touches this file.
  pull_request:
    paths:
      - '.github/workflows/uniprot.yml'
  # Allow manual runs.
  workflow_dispatch:

# Token permissions granted to every job in this workflow.
permissions:
  issues: write
  contents: write
  pages: write
  id-token: write

jobs:
# Job 1: read the release date stored in datasources/uniprot/config and the
# date of the current uniprot_sprot.fasta.gz on the EBI FTP index, and expose
# both (plus the names of the files to download) as job outputs.
check_new_data:
  runs-on: ubuntu-latest
  name: Check the date of the latest data
  outputs:
    UNIPROT_SPROT_NEW: ${{ steps.check_download.outputs.UNIPROT_SPROT_NEW }}
    SEC_AC_NEW: ${{ steps.check_download.outputs.SEC_AC_NEW }}
    DELAC_SP_NEW: ${{ steps.check_download.outputs.DELAC_SP_NEW }}
    DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
    DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
  steps:
    # step 1: check the release date for the latest UniProt files
    - name: Checkout
      uses: actions/checkout@v4
    - name: Check for new uniprot files
      id: check_download
      run: |
        ## Extract the date from the uniprot config file
        date_old=$(grep -E '^date=' datasources/uniprot/config | cut -d'=' -f2)
        echo 'Accessing the uniprot data'
        wget https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/ -O uniprot_index.html
        ## Dump the index page to the log (there is no pager in CI)
        cat uniprot_index.html
        ## Date shown next to uniprot_sprot.fasta.gz in the index table.
        ## `|| true` stops `bash -e` from aborting without a message when the
        ## pattern does not match; the explicit check below reports it instead.
        date_new=$(grep -oP 'uniprot_sprot\.fasta\.gz</a></td><td[^>]*>\K[0-9]{4}-[0-9]{2}-[0-9]{2}' uniprot_index.html | head -n 1 || true)
        if [ -z "$date_new" ]; then
          echo "::error::Could not parse the release date of uniprot_sprot.fasta.gz from the index page"
          exit 1
        fi
        ## Store dates and file names to output. The three file names are
        ## declared as job outputs above and were previously never written.
        {
          echo "DATE_OLD=$date_old"
          echo "DATE_NEW=$date_new"
          echo "UNIPROT_SPROT_NEW=uniprot_sprot.fasta.gz"
          echo "SEC_AC_NEW=sec_ac.txt"
          echo "DELAC_SP_NEW=delac_sp.txt"
        } >> "$GITHUB_OUTPUT"
        echo "Date of latest release: $date_new, Date of release of the current version: $date_old"
# Job 2: download the current release, run the preprocessing, diff the result
# against the stored data and report. Runs after check_new_data and receives
# its two dates as environment variables.
test_new_data_processing:
  name: Processing new data and check updates
  runs-on: ubuntu-latest
  needs: check_new_data
  env:
    DATE_NEW: ${{ needs.check_new_data.outputs.DATE_NEW }}
    DATE_OLD: ${{ needs.check_new_data.outputs.DATE_OLD }}
  outputs:
    # Id of the artifact uploaded by the `artifact-upload` step.
    ARTIFACT_ID: ${{ steps.artifact-upload.outputs.artifact-id }}
  steps:
    - uses: actions/checkout@v4
      name: Checkout repository
# step 2: download the recent data
- name: Download the recent data
  run: |
    ## Store outputs from previous job in environment variables.
    ## The variable name must be the literal DATE_NEW; "$DATE_NEW=..." would
    ## expand to "<date>=<date>" and define a variable named after the date.
    echo "DATE_NEW=$DATE_NEW" >> "$GITHUB_ENV"
    ## Define files to be downloaded
    UNIPROT_SPROT_NEW=uniprot_sprot.fasta.gz
    SEC_AC_NEW=sec_ac.txt
    DELAC_SP_NEW=delac_sp.txt
    base_url=https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete
    ## Create temp. folder to store the data in
    mkdir -p datasources/uniprot/data
    ## Download uniprot files
    wget "$base_url/$UNIPROT_SPROT_NEW"
    wget "$base_url/docs/$SEC_AC_NEW"
    wget "$base_url/docs/$DELAC_SP_NEW"
    mv "$DELAC_SP_NEW" "$SEC_AC_NEW" "$UNIPROT_SPROT_NEW" datasources/uniprot/data
    ## Check file size if available
    ls -trlh datasources/uniprot/data
# step 3: run the Rscripts for uniprot preprocessing
# (this step only provides the R runtime; the script runs in the next step)
- uses: r-lib/actions/setup-r@v2
  name: Install R
  with:
    include-recommended: true
    # Specify the R version; quoted so YAML keeps it a string.
    r-version: '4.1'
# Runs r/src/uniprot.R on the downloaded files and records the outcome in the
# FAILED environment variable ('true' / 'false') for the following steps.
- name: Test uniprot data processing
  id: uniprot_process
  run: |
    sourceVersion=$DATE_NEW
    uniprot_sprot=datasources/uniprot/data/uniprot_sprot.fasta.gz
    sec_ac=datasources/uniprot/data/sec_ac.txt
    delac_sp=datasources/uniprot/data/delac_sp.txt
    # Run the R script inside the `if` condition. The default shell is
    # `bash -e`, so a failing command followed by a separate `$?` test would
    # abort the step before FAILED=true could ever be written.
    if Rscript r/src/uniprot.R "$sourceVersion" "$uniprot_sprot" "$delac_sp" "$sec_ac"; then
      # script succeeded
      echo "Successful preprocessing of uniprot data."
      echo "FAILED=false" >> "$GITHUB_ENV"
    else
      # script failed: keep the job going so the failure issue can be posted
      echo "Failed preprocessing of uniprot data."
      echo "FAILED=true" >> "$GITHUB_ENV"
    fi
# step 4: compare the new and old data
# Diffs the first two columns of the stored mapping file against the freshly
# processed one and exports ADDED, REMOVED, COUNT and CHANGE (percentage of
# changed pairs relative to the old file) through GITHUB_ENV.
- name: Diff versions
  if: ${{ env.FAILED == 'false' }}
  run: |
    # Set up vars from config file
    to_check_from_zenodo=$(grep -E '^to_check_from_zenodo=' datasources/uniprot/config | cut -d'=' -f2)
    old="datasources/uniprot/data/$to_check_from_zenodo"
    new="datasources/uniprot/recentData/$to_check_from_zenodo"
    # Unzip the zip file
    unzip datasources/uniprot/data/UniProt_secID2priID.zip -d datasources/uniprot/data/
    # Extract the primaryID and secondaryID column from both files and sort them
    sort "$old" | tr -d "\r" | cut -f 1,2 > ids_old.txt
    sort "$new" | tr -d "\r" | cut -f 1,2 > ids_new.txt
    # TODO decide whether to perform diff on symbols too?
    # Extract the secondarySymbol column from both files and sort them
    ##cat "$old" | sort | tr -d "\r" | cut -f 4 > symbols_old.txt
    ##cat "$new" | sort | tr -d "\r" | cut -f 4 > symbols_new.txt
    echo "Performing diff between the sorted lists of IDs"
    # Perform a diff between the sorted lists of IDs
    # (diff exits non-zero when the files differ, hence `|| true`)
    output_file=diff.txt
    diff -u ids_old.txt ids_new.txt > "$output_file" || true
    head "$output_file"
    # Lines 1-2 of a unified diff are the `---` / `+++` file headers, so only
    # line 3 onwards is inspected, and only the leading marker is stripped:
    # any `+` or `-` inside the data itself is kept.
    # retrieve new lines
    echo Counting added lines...
    added=$(sed -n '3,$ s/^[+]//p' "$output_file")
    # retrieve removed lines
    echo Counting removed lines...
    removed=$(sed -n '3,$ s/^-//p' "$output_file")
    # "Entry Withdrawn" rows that show up as additions are really removals:
    # move them from the added list to the removed list.
    withdrawn=$(grep 'Entry Withdrawn' <<< "$added" || true)
    if [ -n "$withdrawn" ]; then
      added=$(grep -v 'Entry Withdrawn' <<< "$added" || true)
      if [ -n "$removed" ]; then
        removed=$(printf '%s\n%s\n' "$removed" "$withdrawn")
      else
        removed=$withdrawn
      fi
    fi
    # Drop pairs that appear in both lists (lines that only moved in the diff)
    added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
    removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
    added=$added_filtered
    removed=$removed_filtered
    # count them
    echo Counting differences:
    # make sure we are not counting empty lines; the here-string appends the
    # final newline, so `wc -l` equals the number of entries
    if [ -z "$removed" ]; then
      count_removed=0
      removed="None"
    else
      count_removed=$(wc -l <<< "$removed")
    fi
    if [ -z "$added" ]; then
      count_added=0
      added="None"
    else
      count_added=$(wc -l <<< "$added")
    fi
    echo ________________________________________________
    echo " removed pairs "
    echo ________________________________________________
    echo "$removed"
    echo ________________________________________________
    echo " added pairs "
    echo ________________________________________________
    echo "$added"
    echo _________________________________________________
    echo "What's changed:"
    echo "- Added id pairs: $count_added"
    echo "- Removed id pairs: $count_removed"
    # Store to env to use in issue
    echo "ADDED=$count_added" >> "$GITHUB_ENV"
    echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
    count=$((count_added + count_removed))
    echo "COUNT=$count" >> "$GITHUB_ENV"
    total_old=$(wc -l < "$old")
    # Guard against an empty old file (integer division by zero)
    if [ "$total_old" -gt 0 ]; then
      change=$((100 * count / total_old))
    else
      change=0
    fi
    echo "CHANGE=$change" >> "$GITHUB_ENV"
# Uploads the contents of datasources/uniprot/recentData as the artifact
# `uniprot_processed`; the step id is referenced by the job output ARTIFACT_ID
# and by the dispatch payload.
# NOTE(review): recentData is presumably written by r/src/uniprot.R - confirm.
- id: artifact-upload
  name: 'Upload processed data as artifacts'
  uses: actions/upload-artifact@v4
  with:
    path: datasources/uniprot/recentData/*
    name: uniprot_processed
# step 5: post issues
# Opens (or updates) the "update available" issue when the diff step counted
# at least one changed pair.
- name: Post issue about update availability
  if: ${{ env.COUNT != 0 }}
  uses: JasonEtco/create-an-issue@v2
  with:
    update_existing: true
    filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
  env:
    SOURCE: "UniProt"
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Opens (or updates) the "failing test" issue when the preprocessing step
# recorded FAILED=true.
- name: Post issue about failing test
  if: ${{ env.FAILED == 'true' }}
  uses: JasonEtco/create-an-issue@v2
  with:
    update_existing: true
    filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
  env:
    SOURCE: "UniProt"
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Sends an `update-event` repository dispatch to sec2pri/omicsFixID when the
# diff step counted at least one changed pair. The payload links to the
# artifact uploaded by the `artifact-upload` step of this run.
# NOTE(review): env.URL_RELEASE is not set anywhere in the visible workflow,
# so "url" is presumably sent empty - confirm.
- name: Trigger docker release
  if: ${{ env.COUNT != 0 }}
  uses: peter-evans/repository-dispatch@v3
  with:
    repository: sec2pri/omicsFixID
    event-type: update-event
    token: ${{ secrets.PING }}
    client-payload: >
      {
        "ref": "${{ github.ref }}",
        "sha": "${{ github.sha }}",
        "datasource": "UniProt",
        "version": "NA",
        "url": "${{ env.URL_RELEASE }}",
        "processed_data": "https://github.com/sec2pri/mapping_preprocessing/actions/runs/${{ github.run_id }}/artifacts/${{ steps.artifact-upload.outputs.artifact-id }}"
      }