-
Notifications
You must be signed in to change notification settings - Fork 3
116 lines (109 loc) · 6.13 KB
/
wikidata.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Workflow for downloading and saving Wikidata secondary2primary mappings
name: wikidata

on:
  # Allow the workflow to be started manually.
  workflow_dispatch:
  # Run on pull requests that touch this workflow file, to test that it works.
  pull_request:
    paths:
      - '.github/workflows/wikidata.yml'
  # Run the workflow on the 1st and 15th day of each month (at 00:00).
  schedule:
    - cron: "0 0 1,15 * *"
jobs:
  # Job 1: query Wikidata, post-process the results and commit them.
  wikidata:
    runs-on: ubuntu-latest
    permissions:
      contents: write
    # Values written to GITHUB_ENV are visible only to later steps of this
    # job. Job outputs are what the post-issue job reads through
    # `needs.wikidata.outputs.*`.
    outputs:
      download_file: ${{ steps.queries.outputs.DOWNLOAD_FILE }}
      failed: ${{ steps.queries.outputs.FAILED }}
    steps:
      # step 1: checkout the repository
      - name: Download GitHub repo for the queries
        uses: actions/checkout@v3
      # step 2: run the SPARQL queries from the Wikidata query subfolder
      - name: Run the Queries
        id: queries
        run: |
          ## Make the working directories if not existing already (-p: no error when they are already there)
          mkdir -p datasources/wikidata/results datasources/wikidata/data
          ## Let unmatched globs expand to nothing, so the loops below are simply skipped when no .tsv file is left
          shopt -s nullglob
          ## Define variable to be used in storing and updating output data (to avoid hardcoding for each change) (tba)
          ## Download outdated IDs for chemicals
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_secID2priID.tsv
          ## Download all primary IDs for chemicals
          #curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalAllPrimary.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_priIDs.tsv
          ## Download alias/synonyms/names for chemicals
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalPrimarySynonyms.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_name2synonym.tsv
          ## Download outdated IDs for genes (split from proteins to avoid timeouts)
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneHumanRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/gene_secID2priID.tsv
          ## Download outdated IDs for proteins (split from genes to avoid timeouts)
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/proteinHumanRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/protein_secID2priID.tsv
          ## Download all primary IDs for genes and proteins
          #curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneproteinHumanAllPrimary.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/geneProtein_priIDs.tsv
          ## Download alias/synonyms/names for genes and proteins
          #curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneproteinHumanPrimarySynonyms.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/geneProtein_name2synonym.tsv
          ## Concatenate gene and protein outdated ID data
          ## Add the header of one file as start
          head -n 1 datasources/wikidata/results/gene_secID2priID.tsv > datasources/wikidata/results/geneProtein_secID2priID.tsv
          ## Add gene sec. IDs to the file (not overwrite)
          tail -n +2 -q datasources/wikidata/results/gene_secID2priID.tsv >> datasources/wikidata/results/geneProtein_secID2priID.tsv
          ## Add protein sec. IDs to the file (not overwrite)
          tail -n +2 -q datasources/wikidata/results/protein_secID2priID.tsv >> datasources/wikidata/results/geneProtein_secID2priID.tsv
          ## Check new data; a file containing TimeoutException is reported and discarded
          cd datasources/wikidata/results
          fail_file=''
          for File in *.tsv ## Only for tsv files
          do
            if grep -q TimeoutException "$File"; then
              echo "Query Timeout occurred for file: " "$File"
              echo "Wikidata data will not be updated"
              head -n 20 "$File"
              ## Collect the failed file names as a space-separated list
              fail_file="${fail_file:+${fail_file} }${File}"
              ## Drop the timed-out response so it is not renamed, moved and committed over the previous data
              rm -f "$File"
            else
              echo "No Query Timeout detected for file: " "$File"
            fi
          done
          ## Publish the result: GITHUB_ENV for later steps of this job, GITHUB_OUTPUT for the post-issue job
          if [ -n "$fail_file" ]; then
            echo "DOWNLOAD_FILE=true" >> "$GITHUB_ENV"
            echo "DOWNLOAD_FILE=true" >> "$GITHUB_OUTPUT"
          fi
          echo "FAILED=${fail_file}" >> "$GITHUB_ENV"
          echo "FAILED=${fail_file}" >> "$GITHUB_OUTPUT"
          ## Remove previous output files (if existing)
          ##find . -name 'wikidata*' -exec rm {} \;
          ## Set prefix to Wikidata for renaming new data files
          prefix="Wikidata"
          for f in *.tsv ## Only for tsv files
          do
            ## Remove the IRIs (prefix) | remove the IRIs (suffix) | remove language annotation | save the file with new name
            sed -e 's/<http:\/\/www.wikidata.org\/entity\///g' -e 's/[>]//g' -e 's/@en//g' "$f" > "${prefix}_$f"
            rm "$f"
          done
          ## Change back two levels, to the datasources directory
          cd ../..
          ## Move and overwrite all files from results folder to data folder, to update previous data
          new_files=(wikidata/results/*)
          if [ "${#new_files[@]}" -gt 0 ]; then
            mv -f "${new_files[@]}" wikidata/data/
          else
            echo "No new Wikidata files to move."
          fi
      # step 3: save the data from the queries
      - name: Commit and Push Changes
        # A pull_request run checks out a detached merge commit, on which
        # `git pull` and `git push` fail; the PR run only tests the queries.
        if: github.event_name != 'pull_request'
        run: |
          git pull
          ls
          if [[ -n "$(git status --porcelain)" ]]; then
            git add .
            ## NOTE(review): this address looks like residue of e-mail obfuscation - confirm the intended value
            git config --local user.email "[email protected]"
            git config --local user.name "GitHub Action"
            git commit -m "Updating Wd data"
            git push
          else
            echo "No changes to commit."
            exit 0
          fi

  # Job 2: open an issue when at least one query timed out.
  post-issue:
    needs: wikidata
    # Runs only when the wikidata job reported a timeout through its outputs.
    if: needs.wikidata.outputs.download_file == 'true'
    name: Post issue about timeout
    runs-on: ubuntu-latest
    permissions:
      contents: read
      issues: write
    steps:
      - uses: actions/checkout@v3
      - name: Write the issue template
        env:
          # Passed through the environment instead of being interpolated into
          # the script text.
          FAILED_FILES: ${{ needs.wikidata.outputs.failed }}
        run: |
          {
            echo "---"
            echo "title: Wikidata is timing out"
            echo "assignees: tabbassidaloii"
            echo "---"
            echo "Wikidata query timed out for: ${FAILED_FILES}"
          } >> issue.md
      - uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: issue.md