# wikidata #139
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow that downloads and stores the Wikidata secondary-to-primary
# identifier mappings.
name: wikidata

on:
  # Scheduled refresh: midnight (UTC) on the 1st and the 15th of every month.
  schedule:
    - cron: '0 0 1,15 * *'
  # Exercise the workflow whenever a pull request edits this workflow file.
  pull_request:
    paths:
      - .github/workflows/wikidata.yml
  # Allow manual runs from the Actions tab.
  workflow_dispatch:
jobs:
  # Single job: query Wikidata, validate the results and report via issues.
  query_wikidata:
    runs-on: ubuntu-latest
    outputs:
      # Exposes whether any query timed out (written by the `queries` step).
      TIMEOUT: ${{ steps.queries.outputs.TIMEOUT }}
    permissions:
      contents: write
      # Declaring `permissions` sets every scope that is not listed to `none`.
      # Later steps in this job open issues with GITHUB_TOKEN, so the issues
      # scope has to be granted explicitly.
      issues: write
    steps:
      # step 1: check out the repository
      - name: Checkout
        uses: actions/checkout@v4
# step 2: run the SPARQL queries from the Wikidata query subfolder
- name: Run the Queries
  id: queries
  run: |
    ## Make the results directory; -p keeps this from failing when it already exists
    mkdir -p datasources/wikidata/results

    ## SPARQL endpoints
    wdqs="https://query.wikidata.org/sparql"
    qlever="https://qlever.cs.uni-freiburg.de/api/wikidata"

    ## run_query QUERY_FILE ENDPOINT OUTPUT_FILE
    ## Sends one query file to an endpoint and stores the TSV response.
    ## curl is deliberately run without --fail: the response body is always
    ## saved so the timeout check below can inspect it.
    run_query() {
      curl -H "Accept: text/tab-separated-values" \
        --data-urlencode "query@datasources/wikidata/queries/$1" \
        -G "$2" \
        -o "datasources/wikidata/results/$3"
    }

    ## Download outdated IDs for chemicals Wikidata Style
    run_query chemicalRedirects.rq "$wdqs" metabolites_secID2priID.tsv
    ## Download outdated IDs for chemicals qLever Style
    run_query chemicalRedirects.rq "$qlever" metabolites_secID2priID_qlever.tsv
    ## Download all primary IDs for chemicals
    run_query chemicalAllPrimary.rq "$wdqs" metabolites_priIDs.tsv
    ## Download alias/synonyms/names for chemicals
    run_query chemicalPrimarySynonyms.rq "$wdqs" metabolites_name2synonym.tsv
    ## Download outdated IDs for genes (split from proteins to avoid timeouts)
    run_query geneHumanRedirects.rq "$wdqs" gene_secID2priID.tsv
    ## Download outdated IDs for proteins (split from genes to avoid timeouts)
    run_query proteinHumanRedirects.rq "$wdqs" protein_secID2priID.tsv
    ## Download all primary IDs for genes and proteins
    run_query geneproteinHumanAllPrimary.rq "$wdqs" geneProtein_priIDs.tsv
    ## Download alias/synonyms/names for genes and proteins
    run_query geneproteinHumanPrimarySynonyms.rq "$wdqs" geneProtein_name2synonym.tsv

    ## Concatenate gene and protein outdated ID data:
    ## one header line, followed by the data rows of both files
    merged=datasources/wikidata/results/geneProtein_secID2priID.tsv
    head -n 1 datasources/wikidata/results/gene_secID2priID.tsv > "$merged"
    tail -n +2 -q datasources/wikidata/results/gene_secID2priID.tsv >> "$merged"
    tail -n +2 -q datasources/wikidata/results/protein_secID2priID.tsv >> "$merged"

    ## Check new data and record whether any query timed out
    cd datasources/wikidata/results
    fail_file=''
    for File in *.tsv  ## Only for tsv files
    do
      if grep -q TimeoutException "$File"; then
        echo "Query Timeout occurred for file: " "$File"
        echo "Wikidata data will not be updated"
        head -n 20 "$File"
        fail_file="${fail_file} $File"
      else
        echo "No Query Timeout detected for file: " "$File"
      fi
    done
    ## Write TIMEOUT exactly once, after every file has been inspected.
    ## Writing it per file let a later clean file overwrite an earlier
    ## TIMEOUT=true, because the last value written to GITHUB_ENV wins.
    if [ -n "$fail_file" ]; then
      timeout=true
    else
      timeout=false
    fi
    echo "TIMEOUT=${timeout}" >> "$GITHUB_ENV"
    echo "TIMEOUT=${timeout}" >> "$GITHUB_OUTPUT"
    ## Store the list of timed-out files in GITHUB_ENV for the issue text
    echo "TIMEOUT_QUERY=${fail_file}" >> "$GITHUB_ENV"

    ## Remove previous output files (if existing)
    ##find . -name 'wikidata*' -exec rm {} \;
    ## Prefix used when renaming the new data files
    prefix="Wikidata"
    for f in *.tsv  ## Only for tsv files
    do
      ## Strip the entity IRI prefix, the closing '>' and the '@en' language
      ## tag, then save the cleaned copy under the prefixed name
      sed -e 's/<http:\/\/www.wikidata.org\/entity\///g' \
          -e 's/[>]//g' \
          -e 's/@en//g' "$f" > "${prefix}_$f"
      rm "$f"
    done

    ## Change back to the datasources directory
    cd ../..
    ## Move and overwrite all files from results folder to data folder, to update previous data
    ## NOTE(review): this empties wikidata/results, but later steps read
    ## datasources/wikidata/results/geneProtein_secID2priID.tsv and upload
    ## results/* as an artifact. The move also runs after a timeout, despite
    ## the "will not be updated" message above. Confirm the intended order.
    mv -f wikidata/results/* wikidata/data/
# step 3 (currently disabled): save the data from the queries
# NOTE(review): the e-mail address was obfuscated in this copy of the file;
# "action@github.com" is a guess and must be confirmed before re-enabling.
# - name: Commit and Push Changes
#   run: |
#     git pull
#     ls
#     if [[ `git status --porcelain` ]]; then
#       git add .
#       git config --local user.email "action@github.com"
#       git config --local user.name "GitHub Action"
#       git commit -m "Updating Wd data"
#       git push
#     else
#       echo "No changes to commit."
#       exit 0
#     fi
# Build the issue template that reports which queries timed out.
- name: Write up issue about timeout
  # TIMEOUT is exported through GITHUB_ENV by the query step, so it lives in
  # the `env` context (as a string); the `github` context has no such field.
  if: ${{ env.TIMEOUT == 'true' }}
  run: |
    # TIMEOUT_QUERY (also exported through GITHUB_ENV) lists the affected
    # files. It is read as a plain shell variable.
    {
      echo "---"
      echo "title: Wikidata is timing out"
      echo "assignees: tabbassidaloii"
      echo "---"
      echo "Wikidata query timed out for ${TIMEOUT_QUERY}"
    } >> issue.md
    cat issue.md
# Open an issue from issue.md when a query timeout was recorded.
- name: Post issue about query timeout
  # TIMEOUT comes from GITHUB_ENV, so it is read from the `env` context and
  # compared as a string; `github.TIMEOUT` does not exist.
  if: ${{ env.TIMEOUT == 'true' }}
  uses: JasonEtco/create-an-issue@v2
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  with:
    filename: issue.md
# Validate the ID format of the new gene/protein mapping and diff it against
# the previously stored version. Exports ADDED, REMOVED, COUNT and CHANGE
# (and FAILED on a validation error) through GITHUB_ENV.
- name: RegEx and Diff test
  if: ${{ env.TIMEOUT == 'false' }}
  run: |
    chmod +x datasources/wikidata/config
    . datasources/wikidata/config .
    ls datasources/wikidata/results/
    file_to_diff="geneProtein_secID2priID.tsv"
    old="datasources/wikidata/data/$file_to_diff"
    new="datasources/wikidata/results/$file_to_diff"
    # remove headers
    sed -i '1d' "$new"
    sed -i '1d' "$old"

    # qc integrity of IDs
    wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
    Wikidata_ID=$(awk -F '\t' '$1 == "Wikidata" {print $10}' datasources.tsv)
    # An empty pattern matches every line and would make the checks vacuous
    if [ -z "$Wikidata_ID" ]; then
      echo "Error: no Wikidata ID pattern found in datasources.tsv."
      echo "FAILED=true" >> "$GITHUB_ENV"
      exit 1
    fi
    # Split the file into two separate files for each column
    # (carriage returns are dropped so they cannot break an anchored pattern)
    awk -F '\t' '{print $1}' "$new" | tr -d '\r' > column1.txt
    awk -F '\t' '{print $2}' "$new" | tr -d '\r' > column2.txt

    # check_column FILE LABEL
    # `grep -v` succeeds when at least one line does NOT match the pattern,
    # so success here is the failure case.
    # NOTE(review): -P is used on the assumption that the patterns in
    # datasources.tsv are Java-style (e.g. \d), which -E does not
    # understand. Confirm against that file.
    check_column() {
      if grep -qvP "$Wikidata_ID" "$1"; then
        echo "Error: At least one line in the $2 column does not match pattern."
        grep -nvP "$Wikidata_ID" "$1"
        echo "FAILED=true" >> "$GITHUB_ENV"
        exit 1
      else
        echo "All lines in the $2 column match the pattern."
      fi
    }
    check_column column1.txt primary
    check_column column2.txt secondary

    # sort them
    sort "$old" | tr -d "\r" > ids_old.txt
    sort "$new" | tr -d "\r" > ids_new.txt
    echo "Performing diff between the sorted lists of IDs"
    # Perform a diff between the sorted lists of IDs
    output_file=diff.txt
    diff -u ids_old.txt ids_new.txt > "$output_file" || true
    # retrieve new lines: exactly one leading '+' (skips the '+++' header)
    added=$(grep '^+[^+]' "$output_file" | sed 's/^+//') || true
    # retrieve removed lines: exactly one leading '-' (skips the '---' header)
    removed=$(grep '^-[^-]' "$output_file" | sed 's/^-//') || true

    # Added lines flagged 'Entry Withdrawn' are treated as removals
    withdrawn=$(grep 'Entry Withdrawn' <<< "$added") || true
    added=$(grep -v 'Entry Withdrawn' <<< "$added") || true
    if [ -n "$withdrawn" ]; then
      removed=$(printf '%s\n%s\n' "$removed" "$withdrawn" | sed '/^$/d')
    fi

    # Drop pairs that appear in both lists
    added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
    removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
    added=$added_filtered
    removed=$removed_filtered

    # count them; an empty list is 0. The here-string supplies the final
    # newline, so wc -l counts every line (printf "$var" | wc -l was one short).
    if [ -z "$removed" ]; then
      count_removed=0
      removed="None"
    else
      count_removed=$(wc -l <<< "$removed")
    fi
    if [ -z "$added" ]; then
      count_added=0
      added="None"
    else
      count_added=$(wc -l <<< "$added")
    fi

    echo ________________________________________________
    echo " removed pairs "
    echo ________________________________________________
    echo "$removed"
    echo ________________________________________________
    echo " added pairs "
    echo ________________________________________________
    echo "$added"
    echo _________________________________________________
    echo "What's changed:"
    echo "- Added id pairs: $count_added"
    echo "- Removed id pairs: $count_removed"

    # Store to env to use in issue
    echo "ADDED=$count_added" >> "$GITHUB_ENV"
    echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
    count=$((count_added + count_removed))
    echo "COUNT=$count" >> "$GITHUB_ENV"
    # Percentage of changed pairs relative to the old file; guard against
    # dividing by zero when the old file has no rows
    total_old=$(wc -l < "$old")
    if [ "$total_old" -gt 0 ]; then
      change=$((100 * count / total_old))
    elif [ "$count" -gt 0 ]; then
      change=100
    else
      change=0
    fi
    echo "CHANGE=$change" >> "$GITHUB_ENV"
# Publish everything in the results folder as the "Wikidata_processed"
# artifact; the step id lets later steps reference the artifact id.
- id: artifact-upload
  name: Upload processed data as artifacts
  uses: actions/upload-artifact@v4
  with:
    path: datasources/wikidata/results/*
    name: Wikidata_processed
# Open (or refresh) the "update available" issue when the diff step counted
# at least one changed pair.
- name: Post issue about update availability
  if: env.COUNT != 0
  uses: JasonEtco/create-an-issue@v2
  with:
    update_existing: true
    filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
  env:
    SOURCE: Wikidata
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Open (or refresh) the "test failed" issue when the RegEx test set FAILED.
- name: Post issue about failing test
  uses: JasonEtco/create-an-issue@v2
  # The test step exits non-zero right after setting FAILED=true. Without a
  # status function the condition gets an implicit success() and this step
  # would be skipped, so failure() is required for it to run at all.
  if: ${{ failure() && env.FAILED == 'true' }}
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    SOURCE: Wikidata
  with:
    filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
    update_existing: true
# Notify sec2pri/omicsFixID through a repository_dispatch event when the
# diff step counted at least one changed pair.
- name: Trigger docker release
  uses: peter-evans/repository-dispatch@v3
  if: ${{ env.COUNT != 0 }}
  env:
    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    # NOTE(review): this step runs inside the query_wikidata job, which
    # declares only a TIMEOUT output and no `needs:`. These two expressions
    # therefore look like they resolve to empty strings; confirm where
    # RELEASE_NUMBER and DATE_NEW are supposed to come from.
    RELEASE_NUMBER: ${{ needs.query_wikidata.outputs.RELEASE_NUMBER }}
    RELEASE_DATE: ${{ needs.query_wikidata.outputs.DATE_NEW }}
  with:
    token: ${{ secrets.PING }}
    repository: sec2pri/omicsFixID
    event-type: update-event
    # The artifact link must be one unbroken URL:
    # <repo>/actions/runs/<run id>/artifacts/<artifact id>
    client-payload: >
      {
        "ref": "${{ github.ref }}",
        "sha": "${{ github.sha }}",
        "datasource": "Wikidata",
        "version": "${{ env.RELEASE_NUMBER }}",
        "date": "${{ env.RELEASE_DATE }}",
        "processed_data": "https://github.com/sec2pri/mapping_preprocessing/actions/runs/${{ github.run_id }}/artifacts/${{ steps.artifact-upload.outputs.artifact-id }}"
      }