# Workflow for downloading and saving Wikidata secondary2primary mappings
name: wikidata
on:
  workflow_dispatch:
  pull_request: # tests whether the workflow works on a PR
    paths:
      - '.github/workflows/wikidata.yml'
  schedule:
    - cron: "0 0 1,15 * *" # Run the workflow on the 1st and 15th day of each month
jobs:
  query_wikidata:
    runs-on: ubuntu-latest
    outputs:
      TIMEOUT: ${{ steps.queries.outputs.TIMEOUT }}
    permissions:
      contents: write
    steps:
      # step 1: checkout the repository
      - name: Checkout
        uses: actions/checkout@v4
      # step 2: run the SPARQL queries from the Wikidata query subfolder
      - name: Run the Queries
        id: queries
        run: |
          ## Make the results directory if it does not exist already
          mkdir -p datasources/wikidata/results
          ## Define a variable to be used for storing and updating output data (to avoid hardcoding for each change) (tba)
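          ## Each download below uses the same curl pattern: -G together with
          ## --data-urlencode query@<file> sends the contents of the .rq file as the
          ## URL-encoded 'query' GET parameter, the Accept header asks the endpoint
          ## for tab-separated values, and -o writes the response into results/.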
          ## Download outdated IDs for chemicals, Wikidata style
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_secID2priID.tsv
          ## Download outdated IDs for chemicals, QLever style
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalRedirects.rq -G https://qlever.cs.uni-freiburg.de/api/wikidata -o datasources/wikidata/results/metabolites_secID2priID_qlever.tsv
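          ## (QLever is an alternative SPARQL engine hosted at the University of Freiburg
          ## that serves a copy of Wikidata; presumably the second download is kept for
          ## comparison or as a fallback when query.wikidata.org times out.)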
          ## Download all primary IDs for chemicals
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalAllPrimary.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_priIDs.tsv
          ## Download aliases/synonyms/names for chemicals
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalPrimarySynonyms.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_name2synonym.tsv
          ## Download outdated IDs for genes (split from proteins to avoid timeouts)
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneHumanRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/gene_secID2priID.tsv
          ## Download outdated IDs for proteins (split from genes to avoid timeouts)
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/proteinHumanRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/protein_secID2priID.tsv
          ## Download all primary IDs for genes and proteins
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneproteinHumanAllPrimary.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/geneProtein_priIDs.tsv
          ## Download aliases/synonyms/names for genes and proteins
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneproteinHumanPrimarySynonyms.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/geneProtein_name2synonym.tsv
          ## Concatenate the gene and protein outdated ID data
          head -n 1 datasources/wikidata/results/gene_secID2priID.tsv > datasources/wikidata/results/geneProtein_secID2priID.tsv ## Start the file with the header of one input
          tail -n +2 -q datasources/wikidata/results/gene_secID2priID.tsv >> datasources/wikidata/results/geneProtein_secID2priID.tsv ## Append the gene sec. IDs (not overwrite)
          tail -n +2 -q datasources/wikidata/results/protein_secID2priID.tsv >> datasources/wikidata/results/geneProtein_secID2priID.tsv ## Append the protein sec. IDs (not overwrite)
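          ## (tail -n +2 prints from line 2 onward, skipping each file's header row, so the
          ## concatenated file ends up with exactly one header.)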
          ## Check the new data; fail the job if a query timeout has occurred
          cd datasources/wikidata/results
          fail_file=''
          for File in *.tsv ## Only for tsv files
          do
            if grep -q TimeoutException "$File"; then
              echo "Query timeout occurred for file: $File"
              echo "Wikidata data will not be updated"
              head -n 20 "$File"
              echo "TIMEOUT=true" >> $GITHUB_ENV
              echo "TIMEOUT=true" >> $GITHUB_OUTPUT
              echo "fail_file=$File" >> $GITHUB_ENV
              break
            else
              echo "No query timeout detected for file: $File"
            fi
          done
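          ## Note: variables written to $GITHUB_ENV become env.* values in later steps of
          ## this job, while $GITHUB_OUTPUT feeds the step output that the job-level
          ## TIMEOUT output declared above reads from.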
          ## Remove previous output files (if existing)
          ##find . -name 'wikidata*' -exec rm {} \;
          ## Set the prefix to Wikidata for renaming the new data files
          prefix="Wikidata"
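          ## Note: the *.tsv glob below is expanded once, when the loop starts, so the
          ## renamed ${prefix}_*.tsv files are not picked up and processed a second time.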
          for f in *.tsv ## Only for tsv files
          do
            ## Strip the IRI prefix and the '>' suffix, remove the @en language annotation, and save the file under its new name
            sed -e 's|<http://www.wikidata.org/entity/||g' -e 's/>//g' -e 's/@en//g' "$f" > "${prefix}_$f"
            rm "$f"
          done
          ## Change back to the repository root
          cd ../../..
      - name: Write up issue about timeout
        if: env.TIMEOUT == 'true'
        run: |
          echo "---" >> issue.md
          echo "title: Wikidata is timing out" >> issue.md
          echo "assignees: tabbassidaloii" >> issue.md
          echo "---" >> issue.md
          echo "Wikidata query timed out for ${{ env.fail_file }}" >> issue.md
          cat issue.md
      - name: Post issue about query timeout
        if: env.TIMEOUT == 'true'
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: issue.md
      - name: RegEx and Diff test
        if: ${{ env.TIMEOUT != 'true' }}
        run: |
          chmod +x datasources/wikidata/config
          . datasources/wikidata/config .
echo "New files:"
ls datasources/wikidata/results/
          file_to_diff="Wikidata_geneProtein_secID2priID.tsv" ## the rename loop above prefixed all results files with Wikidata_
          old="datasources/wikidata/data/$file_to_diff"
          new="datasources/wikidata/results/$file_to_diff"
          # remove the headers
          sed -i '1d' "$new"
          sed -i '1d' "$old"
          # QC the integrity of the IDs
          wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
          Wikidata_ID=$(awk -F '\t' '$1 == "Wikidata" {print $10}' datasources.tsv)
          # Split the file into two separate files, one per column
          awk -F '\t' '{print $1}' "$new" > column1.txt
          awk -F '\t' '{print $2}' "$new" > column2.txt
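          # $Wikidata_ID now holds the identifier regex taken from column 10 of
          # datasources.tsv (for Wikidata this is a QID pattern along the lines of ^Q\d+$).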
          # Use grep to check whether any line in the primary column fails to match the pattern
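          # (grep -vE exits with status 0 when at least one NON-matching line exists, so a
          # successful grep here is the failure case)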
          if grep -qvE "$Wikidata_ID" column1.txt; then
            echo "Error: at least one line in the primary column does not match the pattern."
            grep -nvE "$Wikidata_ID" column1.txt
            echo "FAILED=true" >> $GITHUB_ENV
            exit 1
          else
            echo "All lines in the primary column match the pattern."
          fi
          # Use grep to check whether any line in the secondary column fails to match the pattern
          if grep -qvE "$Wikidata_ID" column2.txt; then
            echo "Error: at least one line in the secondary column does not match the pattern."
            grep -nvE "$Wikidata_ID" column2.txt
            echo "FAILED=true" >> $GITHUB_ENV
            exit 1
          else
            echo "All lines in the secondary column match the pattern."
          fi
          # sort the ID lists and strip carriage returns
          sort "$old" | tr -d "\r" > ids_old.txt
          sort "$new" | tr -d "\r" > ids_new.txt
          echo "Performing diff between the sorted lists of IDs"
          # Perform a diff between the sorted lists of IDs
          output_file=diff.txt
          diff -u ids_old.txt ids_new.txt > $output_file || true
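          # In unified diff output, body lines starting with '-' were removed and lines
          # starting with '+' were added; the '---'/'+++' lines are file headers.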
          # retrieve the added lines (skip the +++ header, strip the leading +)
          added=$(grep '^+' "$output_file" | grep -v '^+++' | sed 's/^+//') || true
          # retrieve the removed lines (skip the --- header, strip the leading -)
          removed=$(grep '^-' "$output_file" | grep -v '^---' | sed 's/^-//') || true
          # Create temporary files
          tmp_added=$(mktemp)
          tmp_withdrawn=$(mktemp)
          tmp_removed=$(mktemp)
          # Write the content of the added variable to a temporary file
          echo "$added" > "$tmp_added"
          # Retrieve withdrawn entries and store them in another temporary file
          grep 'Entry Withdrawn' "$tmp_added" > "$tmp_withdrawn" || true
          # Append the withdrawn entries to the removed set
          if [ -n "$removed" ]; then
            echo -e "$removed\n$(cat "$tmp_withdrawn")" > "$tmp_removed"
          else
            cat "$tmp_withdrawn" > "$tmp_removed"
          fi
          # Remove the withdrawn lines from the added set
          sed '/Entry Withdrawn/d' "$tmp_added" > "$tmp_added.tmp" && mv "$tmp_added.tmp" "$tmp_added"
          # Read the updated content back into the variables
          added=$(cat "$tmp_added")
          removed=$(cat "$tmp_removed")
          # Clean up temporary files
          rm "$tmp_added" "$tmp_withdrawn" "$tmp_removed"
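          # comm -23 on two sorted inputs prints the lines unique to the first one
          # (columns 2 and 3, i.e. shared lines and lines unique to the second, are suppressed)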
          added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
          removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
          added=$added_filtered
          removed=$removed_filtered
          # count them (printf '%s\n' appends the trailing newline wc -l needs to count the last line)
          count_removed=$(printf '%s\n' "$removed" | wc -l) || true
          count_added=$(printf '%s\n' "$added" | wc -l) || true
          # make sure we are not counting empty lines
          if [ -z "$removed" ]; then
            count_removed=0
            removed="None"
          fi
          if [ -z "$added" ]; then
            count_added=0
            added="None"
          fi
          echo ________________________________________________
          echo " removed pairs "
          echo ________________________________________________
          echo "$removed"
          echo ________________________________________________
          echo " added pairs "
          echo ________________________________________________
          echo "$added"
          echo _________________________________________________
          echo "What's changed:"
          echo "- Added id pairs: $count_added"
          echo "- Removed id pairs: $count_removed"
          # Store the counts in the environment for use in the issue
          echo "ADDED=$count_added" >> $GITHUB_ENV
          echo "REMOVED=$count_removed" >> $GITHUB_ENV
          count=$((count_added + count_removed))
          echo "COUNT=$count" >> $GITHUB_ENV
          total_old=$(wc -l < "$old") || true
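          # integer percentage of changed pairs relative to the old total
          # (bash arithmetic truncates, so changes under 1% come out as 0)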
          change=$((total_old > 0 ? 100 * count / total_old : 0))
          echo "CHANGE=$change" >> $GITHUB_ENV
      - name: 'Upload processed data as artifacts'
        id: artifact-upload
        uses: actions/upload-artifact@v4
        with:
          name: Wikidata_processed
          path: datasources/wikidata/results/*
      - name: Post issue about update availability
        if: ${{ env.COUNT != 0 }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: Wikidata
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
          update_existing: true
      - name: Post issue about failing test
        if: ${{ failure() && env.FAILED == 'true' }} # failure() is needed because the diff-test step exits 1 on a failed check
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: Wikidata
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
          update_existing: true
      - name: Trigger docker release
        uses: peter-evans/repository-dispatch@v3
        if: ${{ env.COUNT != 0 }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          RELEASE_NUMBER: ${{ needs.query_wikidata.outputs.RELEASE_NUMBER }}
          RELEASE_DATE: ${{ needs.query_wikidata.outputs.DATE_NEW }}
        with:
          token: ${{ secrets.PING }}
          repository: sec2pri/omicsFixID
          event-type: update-event
          client-payload: >
            {
              "ref": "${{ github.ref }}",
              "sha": "${{ github.sha }}",
              "datasource": "Wikidata",
              "version": "${{ env.RELEASE_NUMBER }}",
              "date": "${{ env.RELEASE_DATE }}",
              "processed_data": "https://github.com/sec2pri/mapping_preprocessing/actions/runs/${{ github.run_id }}/artifacts/${{ steps.artifact-upload.outputs.artifact-id }}"
            }