Fixes to ChEBI workflow #83

Workflow file for this run

# Workflow for downloading and saving Wikidata secondary2primary mappings
name: wikidata
on:
  workflow_dispatch:
  pull_request: # tests whether the workflow works on PRs
  schedule:
    - cron: "0 0 1,15 * *" # Run the workflow on the 1st and 15th day of each month
jobs:
  wikidata:
    runs-on: ubuntu-latest
    permissions:
      contents: write
    # Expose the timeout flag and the list of failed files to the post-issue job
    # (GITHUB_ENV does not cross job boundaries; job outputs do)
    outputs:
      download_file: ${{ steps.run_queries.outputs.DOWNLOAD_FILE }}
      failed: ${{ steps.run_queries.outputs.FAILED }}
    # step 1: checkout the repository
    steps:
      - name: Download GitHub repo for the queries
        uses: actions/checkout@v3
      # step 2: run the SPARQL queries from the Wikidata query subfolder
      - name: Run the Queries
        id: run_queries # referenced by the job-level outputs above
        run: |
          ## Make the results directory if it does not exist already
          mkdir -p datasources/wikidata/results
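          ## Each query below is sent to the WDQS SPARQL endpoint as a GET request (-G)
          ## with the .rq file URL-encoded; the Accept header requests TSV output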
          ## Define variable to be used in storing and updating output data (to avoid hardcoding for each change) (tba)
          ## Download outdated IDs for chemicals
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_secID2priID.tsv
          ## Download all primary IDs for chemicals
          #curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalAllPrimary.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_priIDs.tsv
          ## Download aliases/synonyms/names for chemicals
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalPrimarySynonyms.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_name2synonym.tsv
          ## Download outdated IDs for genes (split from proteins to avoid timeouts)
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneHumanRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/gene_secID2priID.tsv
          ## Download outdated IDs for proteins (split from genes to avoid timeouts)
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/proteinHumanRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/protein_secID2priID.tsv
          ## Download all primary IDs for genes and proteins
          #curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneproteinHumanAllPrimary.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/geneProtein_priIDs.tsv
          ## Download aliases/synonyms/names for genes and proteins
          #curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneproteinHumanPrimarySynonyms.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/geneProtein_name2synonym.tsv
          ## Concatenate gene and protein outdated ID data
          head -n 1 datasources/wikidata/results/gene_secID2priID.tsv > datasources/wikidata/results/geneProtein_secID2priID.tsv ## add the header of one file as a start
          tail -n +2 -q datasources/wikidata/results/gene_secID2priID.tsv >> datasources/wikidata/results/geneProtein_secID2priID.tsv ## append gene sec. IDs to the file (no overwrite)
          tail -n +2 -q datasources/wikidata/results/protein_secID2priID.tsv >> datasources/wikidata/results/geneProtein_secID2priID.tsv ## append protein sec. IDs to the file (no overwrite)
          ## Check the new data; flag the run if a query timeout has occurred
          cd datasources/wikidata/results
          fail_file=''
          for File in *.tsv ## only for TSV files
          do
            if grep -q TimeoutException "$File"; then
              echo "Query Timeout occurred for file: $File"
              echo "Wikidata data will not be updated"
              head -n 20 "$File"
              echo "DOWNLOAD_FILE=true" >> "$GITHUB_OUTPUT"
              fail_file="${fail_file} $File"
            else
              echo "No Query Timeout detected for file: $File"
            fi
          done
          # Store the value of fail_file as a step output for the issue
          echo "FAILED=${fail_file}" >> "$GITHUB_OUTPUT"
          ## Remove previous output files (if existing)
          ##find . -name 'wikidata*' -exec rm {} \;
          ## Set prefix to Wikidata for renaming the new data files
          prefix="Wikidata"
          for f in *.tsv ## only for TSV files
          do
            ## Strip the entity IRI prefix, the closing '>', and the @en language annotation, then save the file under its new name
            sed -e 's/<http:\/\/www.wikidata.org\/entity\///g' -e 's/[>]//g' -e 's/@en//g' "$f" > "${prefix}_$f"
            rm "$f"
          done
          ## Change back to the datasources directory
          cd ../..
          ## Move and overwrite all files from the results folder to the data folder, to update the previous data
          mv -f wikidata/results/* wikidata/data/
      # step 3: save the data from the queries
      - name: Commit and Push Changes
        run: |
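          # Sync with the remote first, in case the branch moved while the queries ran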
          git pull
          ls
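          # git status --porcelain prints one line per changed file; empty output means nothing to commit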
          if [[ $(git status --porcelain) ]]; then
            git add .
            git config --local user.email "[email protected]"
            git config --local user.name "GitHub Action"
            git commit -m "Updating Wd data"
            git push
          else
            echo "No changes to commit."
            exit 0
          fi
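  # Second job: post an issue when any query timed out, based on the outputs of the wikidata job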
  post-issue:
    needs: wikidata
    if: needs.wikidata.outputs.download_file == 'true'
    name: Post issue about timeout
    runs-on: ubuntu-latest
    permissions:
      contents: read
      issues: write
    steps:
      - uses: actions/checkout@v3
      - run: |
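          # Assemble issue.md: YAML front matter (title, assignees) followed by the issue body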
echo "---" >> issue.md
echo "title: Wikidata is timing out>> issue.md
echo "assignees: tabbassidaloii" >> issue.md
echo "---" >> issue.md
echo "Wikidata query timed out for $${{ github.FAILED }}" >> issue.md
      - uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: issue.md