# Fixes to ChEBI workflow #112
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow for downloading and saving Wikidata secondary2primary mappings
#
# Job "wikidata" runs the SPARQL queries, cleans the results, moves them into
# datasources/wikidata/data and commits them. Job "post-issue" opens an issue
# when at least one query response contained a TimeoutException.
name: wikidata

on:
  workflow_dispatch:
  pull_request: # tests whether it is working on PR
    paths:
      - '.github/workflows/wikidata.yml'
  schedule:
    - cron: "0 0 1,15 * *" # Run the workflow on the 1st and 15th day of each month

jobs:
  wikidata:
    runs-on: ubuntu-latest
    permissions:
      contents: write
    # Values written to GITHUB_ENV are only visible to later steps of the same
    # job, so the timeout status is published as job outputs for "post-issue".
    outputs:
      DOWNLOAD_FILE: ${{ steps.queries.outputs.DOWNLOAD_FILE }}
      FAILED: ${{ steps.queries.outputs.FAILED }}
    steps:
      # step 1: checkout the repository
      - name: Download GitHub repo for the queries
        uses: actions/checkout@v3

      # step 2: run the SPARQL queries from the Wikidata query subfolder
      - name: Run the Queries
        id: queries
        run: |
          ## Locations used for storing and updating output data (defined once to avoid hardcoding)
          repo_root=$PWD
          queries_dir=datasources/wikidata/queries
          results_dir=datasources/wikidata/results
          data_dir=datasources/wikidata/data
          endpoint=https://query.wikidata.org/sparql

          ## Unmatched globs expand to nothing instead of the literal pattern
          shopt -s nullglob

          ## Make directories if not existing already
          mkdir -p "$results_dir" "$data_dir"

          ## Run the query file $1 and save the TSV response as $2 in the results folder.
          ## curl is run without --fail so the response body is always saved and
          ## can be scanned by the timeout check below.
          run_query() {
            curl -H "Accept: text/tab-separated-values" \
              --data-urlencode "query@${queries_dir}/$1" \
              -G "$endpoint" \
              -o "${results_dir}/$2"
          }

          ## Download outdated IDs for chemicals
          run_query chemicalRedirects.rq metabolites_secID2priID.tsv
          ## Download all primary IDs for chemicals
          #run_query chemicalAllPrimary.rq metabolites_priIDs.tsv
          ## Download alias/synonyms/names for chemicals
          run_query chemicalPrimarySynonyms.rq metabolites_name2synonym.tsv
          ## Download outdated IDs for genes (split from proteins to avoid timeouts)
          run_query geneHumanRedirects.rq gene_secID2priID.tsv
          ## Download outdated IDs for proteins (split from genes to avoid timeouts)
          run_query proteinHumanRedirects.rq protein_secID2priID.tsv
          ## Download all primary IDs for genes and proteins
          #run_query geneproteinHumanAllPrimary.rq geneProtein_priIDs.tsv
          ## Download alias/synonyms/names for genes and proteins
          #run_query geneproteinHumanPrimarySynonyms.rq geneProtein_name2synonym.tsv

          ## Concatenate gene and protein outdated ID data:
          ## header of the gene file first, then the data rows of both files
          {
            head -n 1 "${results_dir}/gene_secID2priID.tsv"
            tail -n +2 -q \
              "${results_dir}/gene_secID2priID.tsv" \
              "${results_dir}/protein_secID2priID.tsv"
          } > "${results_dir}/geneProtein_secID2priID.tsv"

          ## Check new data; a response that contains TimeoutException is reported
          ## and dropped so that it cannot overwrite the previously stored data
          cd "$results_dir"
          failed_files=()
          for File in *.tsv; do ## Only for tsv files
            if grep -q TimeoutException "$File"; then
              echo "Query Timeout occurred for file: " "$File"
              echo "Wikidata data will not be updated"
              head -n 20 "$File"
              failed_files+=("$File")
              rm -- "$File"
            else
              echo "No Query Timeout detected for file: " "$File"
            fi
          done

          ## Publish the result of the check: GITHUB_OUTPUT feeds the job outputs
          ## read by "post-issue", GITHUB_ENV keeps it available to later steps here
          if (( ${#failed_files[@]} > 0 )); then
            echo "DOWNLOAD_FILE=true" >> "$GITHUB_OUTPUT"
            echo "DOWNLOAD_FILE=true" >> "$GITHUB_ENV"
          fi
          echo "FAILED=${failed_files[*]}" >> "$GITHUB_OUTPUT"
          echo "FAILED=${failed_files[*]}" >> "$GITHUB_ENV"

          ## Set prefix to Wikidata for renaming new data files
          prefix=Wikidata
          for f in *.tsv; do ## Only for tsv files
            ## Remove the IRIs (prefix) | remove the IRIs (suffix) | remove language annotation | save the file with new name
            sed -e 's|<http://www.wikidata.org/entity/||g' \
                -e 's/[>]//g' \
                -e 's/@en//g' \
                "$f" > "${prefix}_$f"
            rm -- "$f"
          done

          ## Change back to the repository root
          cd "$repo_root"

          ## Move and overwrite all files from results folder to data folder, to update previous data
          new_files=("${results_dir}"/*)
          if (( ${#new_files[@]} > 0 )); then
            mv -f -- "${new_files[@]}" "${data_dir}/"
          else
            echo "No new Wikidata files to move."
          fi

      # step 3: save the data from the queries
      - name: Commit and Push Changes
        run: |
          git pull
          ls
          if [[ -n "$(git status --porcelain)" ]]; then
            git add .
            # NOTE(review): confirm the committer e-mail address for this repository
            git config --local user.email "action@github.com"
            git config --local user.name "GitHub Action"
            git commit -m "Updating Wd data"
            git push
          else
            echo "No changes to commit."
          fi

  post-issue:
    needs: wikidata
    # Runs only when the "wikidata" job reported at least one timed-out query
    if: needs.wikidata.outputs.DOWNLOAD_FILE == 'true'
    name: Post issue about timeout
    runs-on: ubuntu-latest
    permissions:
      contents: read
      issues: write
    steps:
      - uses: actions/checkout@v3
      # Build the issue template (front matter + body) read by create-an-issue.
      # The file list is passed through the environment instead of being
      # interpolated into the script text.
      - env:
          FAILED: ${{ needs.wikidata.outputs.FAILED }}
        run: |
          {
            echo "---"
            echo "title: Wikidata is timing out"
            echo "assignees: tabbassidaloii"
            echo "---"
            echo "Wikidata query timed out for ${FAILED}"
          } >> issue.md
      - uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: issue.md