# Workflow for downloading and saving Wikidata secondary2primary mappings
name: wikidata
on:
  workflow_dispatch:
  pull_request: # tests whether the workflow works on a PR
    paths:
      - '.github/workflows/wikidata.yml'
  schedule:
    - cron: "0 0 1,15 * *" # Run the workflow on the 1st and 15th day of each month
jobs:
  query_wikidata:
    runs-on: ubuntu-latest
    outputs:
      TIMEOUT: ${{ steps.queries.outputs.TIMEOUT }}
    permissions:
      contents: write
    steps:
      # step 1: checkout the repository
      - name: Checkout
        uses: actions/checkout@v4
      # step 2: run the SPARQL queries from the Wikidata query subfolder
      - name: Run the Queries
        id: queries
        run: |
          ## Make the results directory if it does not exist already
          mkdir -p datasources/wikidata/results
          ## Define a variable to be used for storing and updating output data (to avoid hardcoding for each change) (tba)
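          ## Each download below uses the same curl pattern: -G together with
          ## --data-urlencode query@<file> sends the contents of the .rq file as the
          ## URL-encoded 'query' GET parameter, the Accept header asks the endpoint
          ## for tab-separated values, and -o writes the response into results/.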
          ## Download outdated IDs for chemicals, Wikidata style
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_secID2priID.tsv
          ## Download outdated IDs for chemicals, QLever style
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalRedirects.rq -G https://qlever.cs.uni-freiburg.de/api/wikidata -o datasources/wikidata/results/metabolites_secID2priID_qlever.tsv
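          ## (QLever is an alternative SPARQL engine hosted at the University of Freiburg
          ## that serves a copy of Wikidata; presumably the second download is kept for
          ## comparison or as a fallback when query.wikidata.org times out.)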
          ## Download all primary IDs for chemicals
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalAllPrimary.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_priIDs.tsv
          ## Download aliases/synonyms/names for chemicals
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/chemicalPrimarySynonyms.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/metabolites_name2synonym.tsv
          ## Download outdated IDs for genes (split from proteins to avoid timeouts)
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneHumanRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/gene_secID2priID.tsv
          ## Download outdated IDs for proteins (split from genes to avoid timeouts)
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/proteinHumanRedirects.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/protein_secID2priID.tsv
          ## Download all primary IDs for genes and proteins
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneproteinHumanAllPrimary.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/geneProtein_priIDs.tsv
          ## Download aliases/synonyms/names for genes and proteins
          curl -H "Accept: text/tab-separated-values" --data-urlencode query@datasources/wikidata/queries/geneproteinHumanPrimarySynonyms.rq -G https://query.wikidata.org/sparql -o datasources/wikidata/results/geneProtein_name2synonym.tsv
          ## Concatenate the gene and protein outdated ID data
          head -n 1 datasources/wikidata/results/gene_secID2priID.tsv > datasources/wikidata/results/geneProtein_secID2priID.tsv ## Start the file with the header of one input
          tail -n +2 -q datasources/wikidata/results/gene_secID2priID.tsv >> datasources/wikidata/results/geneProtein_secID2priID.tsv ## Append the gene sec. IDs (not overwrite)
          tail -n +2 -q datasources/wikidata/results/protein_secID2priID.tsv >> datasources/wikidata/results/geneProtein_secID2priID.tsv ## Append the protein sec. IDs (not overwrite)
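          ## (tail -n +2 prints from line 2 onward, skipping each file's header row, so the
          ## concatenated file ends up with exactly one header.)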
          ## Check the new data; fail the job if a query timeout has occurred
          cd datasources/wikidata/results
          fail_file=''
          for File in *.tsv ## Only for tsv files
          do
            if grep -q TimeoutException "$File"; then
              echo "Query timeout occurred for file: $File"
              echo "Wikidata data will not be updated"
              head -n 20 "$File"
              echo "TIMEOUT=true" >> $GITHUB_ENV
              echo "TIMEOUT=true" >> $GITHUB_OUTPUT
              echo "fail_file=$File" >> $GITHUB_ENV
              break
            else
              echo "No query timeout detected for file: $File"
            fi
          done
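          ## Note: variables written to $GITHUB_ENV become env.* values in later steps of
          ## this job, while $GITHUB_OUTPUT feeds the step output that the job-level
          ## TIMEOUT output declared above reads from.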
          ## Remove previous output files (if existing)
          ##find . -name 'wikidata*' -exec rm {} \;
          ## Set the prefix to Wikidata for renaming the new data files
          prefix="Wikidata"
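          ## Note: the *.tsv glob below is expanded once, when the loop starts, so the
          ## renamed ${prefix}_*.tsv files are not picked up and processed a second time.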
          for f in *.tsv ## Only for tsv files
          do
            ## Strip the IRI prefix and the '>' suffix, remove the @en language annotation, and save the file under its new name
            sed -e 's|<http://www.wikidata.org/entity/||g' -e 's/>//g' -e 's/@en//g' "$f" > "${prefix}_$f"
            rm "$f"
          done
          ## Change back to the repository root
          cd ../../..
      - name: Write up issue about timeout
        if: env.TIMEOUT == 'true'
        run: |
          echo "---" >> issue.md
          echo "title: Wikidata is timing out" >> issue.md
          echo "assignees: tabbassidaloii" >> issue.md
          echo "---" >> issue.md
          echo "Wikidata query timed out for ${{ env.fail_file }}" >> issue.md
          cat issue.md
      - name: Post issue about query timeout
        if: env.TIMEOUT == 'true'
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          filename: issue.md
      - name: RegEx and Diff test
        if: ${{ env.TIMEOUT != 'true' }}
        run: |
          chmod +x datasources/wikidata/config
          . datasources/wikidata/config .
echo "New files:"
ls datasources/wikidata/results/
          file_to_diff="Wikidata_geneProtein_secID2priID.tsv" ## the rename loop above prefixed all results files with Wikidata_
          old="datasources/wikidata/data/$file_to_diff"
          new="datasources/wikidata/results/$file_to_diff"
          # remove the headers
          sed -i '1d' "$new"
          sed -i '1d' "$old"
          # QC the integrity of the IDs
          wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
          Wikidata_ID=$(awk -F '\t' '$1 == "Wikidata" {print $10}' datasources.tsv)
          # Split the file into two separate files, one per column
          awk -F '\t' '{print $1}' "$new" > column1.txt
          awk -F '\t' '{print $2}' "$new" > column2.txt
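          # $Wikidata_ID now holds the identifier regex taken from column 10 of
          # datasources.tsv (for Wikidata this is a QID pattern along the lines of ^Q\d+$).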
          # Use grep to check whether any line in the primary column fails to match the pattern
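          # (grep -vE exits with status 0 when at least one NON-matching line exists, so a
          # successful grep here is the failure case)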
          if grep -qvE "$Wikidata_ID" column1.txt; then
            echo "Error: at least one line in the primary column does not match the pattern."
            grep -nvE "$Wikidata_ID" column1.txt
            echo "FAILED=true" >> $GITHUB_ENV
            exit 1
          else
            echo "All lines in the primary column match the pattern."
          fi
          # Use grep to check whether any line in the secondary column fails to match the pattern
          if grep -qvE "$Wikidata_ID" column2.txt; then
            echo "Error: at least one line in the secondary column does not match the pattern."
            grep -nvE "$Wikidata_ID" column2.txt
            echo "FAILED=true" >> $GITHUB_ENV
            exit 1
          else
            echo "All lines in the secondary column match the pattern."
          fi
          # sort the ID lists and strip carriage returns
          sort "$old" | tr -d "\r" > ids_old.txt
          sort "$new" | tr -d "\r" > ids_new.txt
          echo "Performing diff between the sorted lists of IDs"
          # Perform a diff between the sorted lists of IDs
          output_file=diff.txt
          diff -u ids_old.txt ids_new.txt > $output_file || true
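          # In unified diff output, body lines starting with '-' were removed and lines
          # starting with '+' were added; the '---'/'+++' lines are file headers.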
          # retrieve the added lines (skip the +++ header, strip the leading +)
          added=$(grep '^+' "$output_file" | grep -v '^+++' | sed 's/^+//') || true
          # retrieve the removed lines (skip the --- header, strip the leading -)
          removed=$(grep '^-' "$output_file" | grep -v '^---' | sed 's/^-//') || true
          # Create temporary files
          tmp_added=$(mktemp)
          tmp_withdrawn=$(mktemp)
          tmp_removed=$(mktemp)
          # Write the content of the added variable to a temporary file
          echo "$added" > "$tmp_added"
          # Retrieve withdrawn entries and store them in another temporary file
          grep 'Entry Withdrawn' "$tmp_added" > "$tmp_withdrawn" || true
          # Append the withdrawn entries to the removed set
          if [ -n "$removed" ]; then
            echo -e "$removed\n$(cat "$tmp_withdrawn")" > "$tmp_removed"
          else
            cat "$tmp_withdrawn" > "$tmp_removed"
          fi
          # Remove the withdrawn lines from the added set
          sed '/Entry Withdrawn/d' "$tmp_added" > "$tmp_added.tmp" && mv "$tmp_added.tmp" "$tmp_added"
          # Read the updated content back into the variables
          added=$(cat "$tmp_added")
          removed=$(cat "$tmp_removed")
          # Clean up temporary files
          rm "$tmp_added" "$tmp_withdrawn" "$tmp_removed"
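          # comm -23 on two sorted inputs prints the lines unique to the first one
          # (columns 2 and 3, i.e. shared lines and lines unique to the second, are suppressed)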
          added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
          removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
          added=$added_filtered
          removed=$removed_filtered
          # count them (printf '%s\n' appends the trailing newline wc -l needs to count the last line)
          count_removed=$(printf '%s\n' "$removed" | wc -l) || true
          count_added=$(printf '%s\n' "$added" | wc -l) || true
          # make sure we are not counting empty lines
          if [ -z "$removed" ]; then
            count_removed=0
            removed="None"
          fi
          if [ -z "$added" ]; then
            count_added=0
            added="None"
          fi
          echo ________________________________________________
          echo " removed pairs "
          echo ________________________________________________
          echo "$removed"
          echo ________________________________________________
          echo " added pairs "
          echo ________________________________________________
          echo "$added"
          echo _________________________________________________
          echo "What's changed:"
          echo "- Added id pairs: $count_added"
          echo "- Removed id pairs: $count_removed"
          # Store the counts in the environment for use in the issue
          echo "ADDED=$count_added" >> $GITHUB_ENV
          echo "REMOVED=$count_removed" >> $GITHUB_ENV
          count=$((count_added + count_removed))
          echo "COUNT=$count" >> $GITHUB_ENV
          total_old=$(wc -l < "$old") || true
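          # integer percentage of changed pairs relative to the old total
          # (bash arithmetic truncates, so changes under 1% come out as 0)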
          change=$((total_old > 0 ? 100 * count / total_old : 0))
          echo "CHANGE=$change" >> $GITHUB_ENV
      - name: 'Upload processed data as artifacts'
        id: artifact-upload
        uses: actions/upload-artifact@v4
        with:
          name: Wikidata_processed
          path: datasources/wikidata/results/*
      - name: Post issue about update availability
        if: ${{ env.COUNT != 0 }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: Wikidata
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
          update_existing: true
      - name: Post issue about failing test
        if: ${{ failure() && env.FAILED == 'true' }} # failure() is needed because the diff-test step exits 1 on a failed check
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: Wikidata
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
          update_existing: true
      - name: Trigger docker release
        uses: peter-evans/repository-dispatch@v3
        if: ${{ env.COUNT != 0 }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          RELEASE_NUMBER: ${{ needs.query_wikidata.outputs.RELEASE_NUMBER }}
          RELEASE_DATE: ${{ needs.query_wikidata.outputs.DATE_NEW }}
        with:
          token: ${{ secrets.PING }}
          repository: sec2pri/omicsFixID
          event-type: update-event
          client-payload: >
            {
              "ref": "${{ github.ref }}",
              "sha": "${{ github.sha }}",
              "datasource": "Wikidata",
              "version": "${{ env.RELEASE_NUMBER }}",
              "date": "${{ env.RELEASE_DATE }}",
              "processed_data": "https://github.com/sec2pri/mapping_preprocessing/actions/runs/${{ github.run_id }}/artifacts/${{ steps.artifact-upload.outputs.artifact-id }}"
            }