-
Notifications
You must be signed in to change notification settings - Fork 3
284 lines (269 loc) · 11.8 KB
/
chebi.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
name: Check and test ChEBI updates

# Triggers: a fortnightly schedule, a manual run from the Actions tab, and any
# pull request that modifies this workflow file.
on:
  schedule:
    # 00:00 on the 1st and 15th day of each month
    - cron: '0 0 1,15 * *'
  pull_request:
    # lets a PR that edits the workflow exercise it
    paths:
      - ".github/workflows/chebi.yml"
  workflow_dispatch:

# Scopes granted to the GITHUB_TOKEN for every job in this workflow.
permissions:
  issues: write
  contents: write
  id-token: write
  pages: write
jobs:
  # ---------------------------------------------------------------------------
  # Job 1: read the ChEBI archive index and decide whether a release newer than
  # the one recorded in datasources/chebi/config has been published.
  # Its outputs are consumed by the test_sdf_processing job.
  # ---------------------------------------------------------------------------
  check_new_release:
    runs-on: ubuntu-latest
    name: Check latest release date
    outputs:
      RELEASE_NUMBER: ${{ steps.check_download.outputs.RELEASE_NUMBER }}
      CURRENT_RELEASE_NUMBER: ${{ steps.check_download.outputs.CURRENT_RELEASE_NUMBER }}
      NEW_RELEASE: ${{ steps.check_download.outputs.NEW_RELEASE }}
      DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
      DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
    steps:
      # checkout the repository
      - name: Checkout
        uses: actions/checkout@v4
      # check the release date for the latest ChEBI release
      - name: Check for new ChEBI release
        id: check_download
        run: |
          ## Read config; this script reads $release and $date from it
          . datasources/chebi/config .
          # Keep the configured values before $release is overwritten below
          current_release=$release
          date_old=$date
          echo 'Accessing the ChEBI archive'
          wget https://ftp.ebi.ac.uk/pub/databases/chebi/archive/ -O chebi_index.html
          echo "CURRENT_RELEASE_NUMBER=$current_release" >> "$GITHUB_OUTPUT"
          ## The 4th line from the end of the index is taken as the latest release entry
          latest_entry=$(tail -4 chebi_index.html | head -1)
          ## Extract the date of that entry (up to the day) and its release number
          date_new=$(echo "$latest_entry" | grep -oP '<td align="right">\K[0-9-]+\s[0-9:]+(?=\s+</td>)' | awk '{print $1}') || true
          release=$(echo "$latest_entry" | grep -oP '(?<=a href="rel)\d\d\d') || true
          ## Clean up
          rm chebi_index.html
          # Stop early if the index layout changed: `date -d ""` would still
          # yield a timestamp and could wrongly report a new release.
          if [ -z "$date_new" ] || [ -z "$release" ]; then
            echo "::error::Could not parse the latest release from the ChEBI archive index"
            exit 1
          fi
          # https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-an-output-parameter
          echo "RELEASE_NUMBER=$release" >> "$GITHUB_OUTPUT"
          # Store dates to output
          echo "DATE_OLD=$date_old" >> "$GITHUB_OUTPUT"
          echo "DATE_NEW=$date_new" >> "$GITHUB_OUTPUT"
          ## Compare the dates and flag a new release if date_new is more recent
          timestamp1=$(date -d "$date_new" +%s)
          timestamp2=$(date -d "$date_old" +%s)
          if [ "$timestamp1" -gt "$timestamp2" ]; then
            echo 'New release available', "$release"
            echo "NEW_RELEASE=true" >> "$GITHUB_OUTPUT"
          else
            echo 'No new release available'
            # Written explicitly; the dependent job compares against 'true'
            echo "NEW_RELEASE=false" >> "$GITHUB_OUTPUT"
          fi
          echo "Date of latest release: $date_new", "Date of release of the current version: $date_old"

  # ---------------------------------------------------------------------------
  # Job 2: download the new release, run the Java preprocessing on it, validate
  # the produced IDs and diff them against the currently stored mapping file.
  # Opens an issue describing the update, or one describing the failure.
  # ---------------------------------------------------------------------------
  test_sdf_processing:
    name: Test release
    # Explicit string comparison: any non-empty output (even 'false') is truthy
    if: needs.check_new_release.outputs.NEW_RELEASE == 'true'
    needs: check_new_release
    env:
      RELEASE_NUMBER: ${{ needs.check_new_release.outputs.RELEASE_NUMBER }}
      DATE_OLD: ${{ needs.check_new_release.outputs.DATE_OLD }}
      DATE_NEW: ${{ needs.check_new_release.outputs.DATE_NEW }}
      CURRENT_RELEASE_NUMBER: ${{ needs.check_new_release.outputs.CURRENT_RELEASE_NUMBER }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Download SDF for new release
        run: |
          # Store outputs from previous job in environment variables
          # (the variable name must be the literal DATE_NEW, not its value)
          echo "DATE_NEW=$DATE_NEW" >> "$GITHUB_ENV"
          echo "$RELEASE_NUMBER"
          echo "RELEASE_NUMBER=$RELEASE_NUMBER" >> "$GITHUB_ENV"
          echo "CURRENT_RELEASE_NUMBER=$CURRENT_RELEASE_NUMBER" >> "$GITHUB_ENV"
          url_release="https://ftp.ebi.ac.uk/pub/databases/chebi/archive/rel$RELEASE_NUMBER/SDF/"
          echo "URL_RELEASE=$url_release" >> "$GITHUB_ENV"
          ## Download ChEBI SDF file
          wget "${url_release}ChEBI_complete_3star.sdf.gz"
          ## Unzip gz file:
          gunzip ChEBI_complete_3star.sdf.gz #TODO replace by config var
          ## Print file size
          ls -lh ChEBI_complete_3star.sdf
          # Set up vars from config file
          chmod +x datasources/chebi/config
          . datasources/chebi/config .
          ## Create temp. folder to store the data in
          mkdir -p mapping_preprocessing/datasources/chebi/data
      # Java runtime needed to run the ChEBI preprocessing .jar
      - name: Set up Java
        uses: actions/setup-java@v4
        with:
          java-version: '11'
          distribution: 'temurin'
      # Download current version from Zenodo
      #- name: Download current mapping file from Zenodo
      #  id: download_current_zenodo
      #  env:
      #    zenodo_token: ${{ secrets.ZENODO }}
      #  run: |
      #    # Set up vars from config file
      #    chmod +x datasources/chebi/config
      #    . datasources/chebi/config .
      #    echo "file name: $to_check_from_zenodo"
      #    # Request Zenodo API to download the file
      #    curl -H "Authorization: Bearer $zenodo_token" -LJO https://zenodo.org/api/record/$zenodo_file_id/$to_check_from_zenodo
      - name: Test SDF processing
        id: sdf_process
        run: |
          inputFile="ChEBI_complete_3star.sdf"
          # NOTE(review): nothing visible in this workflow uses ./new - confirm it is still needed
          mkdir new
          outputDir="datasources/chebi/recentData/"
          mkdir -p "$outputDir"
          # Run the Java program inside the `if` condition: the default shell
          # is `bash -e`, which would abort the step before a separate `$?`
          # check could record the failure.
          if java -cp java/target/mapping_prerocessing-0.0.1-jar-with-dependencies.jar org.sec2pri.chebi_sdf "$inputFile" "$outputDir"; then
            # Java program succeeded
            echo "Successful preprocessing of ChEBI data."
            echo "FAILED=false" >> "$GITHUB_ENV"
          else
            # Java program failed
            echo "Failed preprocessing of ChEBI data."
            echo "FAILED=true" >> "$GITHUB_ENV"
          fi
      - name: RegEx and Diff test
        if: ${{ env.FAILED == 'false' }}
        run: |
          chmod +x datasources/chebi/config
          . datasources/chebi/config .
          old="datasources/chebi/data/$to_check_from_zenodo"
          new="datasources/chebi/recentData/$to_check_from_zenodo"
          # remove headers
          sed -i '1d' "$new"
          sed -i '1d' "$old"

          # --- qc integrity of IDs ---
          wget -nc https://raw.githubusercontent.com/bridgedb/datasources/main/datasources.tsv
          CHEBI_ID=$(awk -F '\t' '$1 == "ChEBI" {print $10}' datasources.tsv)
          # An empty pattern matches every line and would make the check meaningless
          if [ -z "$CHEBI_ID" ]; then
            echo "Error: no ID pattern found for ChEBI in datasources.tsv."
            echo "FAILED=true" >> "$GITHUB_ENV"
            exit 1
          fi
          # Split the file into two separate files for each column
          # (carriage returns are dropped so they cannot break an anchored pattern)
          awk -F '\t' '{sub(/\r$/, ""); print $1}' "$new" > column1.txt
          awk -F '\t' '{sub(/\r$/, ""); print $2}' "$new" > column2.txt
          # Fails the step if any line of file $1 does not match $CHEBI_ID;
          # $2 is the column label used in the messages.
          # `grep -v -q` exits 0 when at least one NON-matching line exists.
          # NOTE(review): -P assumes the pattern may use Perl classes such as \d,
          # which -E does not interpret as a digit class - confirm against datasources.tsv.
          check_column() {
            if grep -qvP "$CHEBI_ID" "$1"; then
              echo "Error: At least one line in the $2 column does not match pattern."
              grep -nvP "$CHEBI_ID" "$1"
              echo "FAILED=true" >> "$GITHUB_ENV"
              exit 1
            fi
            echo "All lines in the $2 column match the pattern."
          }
          check_column column1.txt primary
          check_column column2.txt secondary

          # --- diff old vs. new ---
          # sort them
          sort "$old" | tr -d "\r" > ids_old.txt
          sort "$new" | tr -d "\r" > ids_new.txt
          echo "Performing diff between the sorted lists of IDs"
          output_file=diff.txt
          diff -u ids_old.txt ids_new.txt > "$output_file" || true
          # retrieve new lines (strip only the leading diff marker)
          added=$(grep '^+CHEBI' "$output_file" | sed 's/^+//') || true
          # retrieve removed lines, skipping the '--- <file>' diff header
          removed=$(grep '^-' "$output_file" | grep -v '^--- ' | sed 's/^-//') || true
          # Lines flagged 'Entry Withdrawn' are reported as removed, not added
          withdrawn=$(grep 'Entry Withdrawn' <<< "$added") || true
          added=$(grep -v 'Entry Withdrawn' <<< "$added") || true
          if [ -n "$withdrawn" ]; then
            if [ -n "$removed" ]; then
              removed="$removed"$'\n'"$withdrawn"
            else
              removed="$withdrawn"
            fi
          fi
          # Drop lines that appear in both lists
          added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
          removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
          added=$added_filtered
          removed=$removed_filtered
          # count them; the here-string supplies the final newline wc -l needs,
          # and empty lists are handled separately so they count as 0
          if [ -z "$removed" ]; then
            count_removed=0
            removed="None"
          else
            count_removed=$(wc -l <<< "$removed")
          fi
          if [ -z "$added" ]; then
            count_added=0
            added="None"
          else
            count_added=$(wc -l <<< "$added")
          fi
          echo ________________________________________________
          echo " removed pairs "
          echo ________________________________________________
          echo "$removed"
          echo ________________________________________________
          echo " added pairs "
          echo ________________________________________________
          echo "$added"
          echo _________________________________________________
          echo "What's changed:"
          echo "- Added id pairs: $count_added"
          echo "- Removed id pairs: $count_removed"
          # Store to env to use in issue
          echo "ADDED=$count_added" >> "$GITHUB_ENV"
          echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
          count=$((count_added + count_removed))
          echo "COUNT=$count" >> "$GITHUB_ENV"
          # Percentage of changed pairs relative to the old file (0 if it is empty)
          total_old=$(wc -l < "$old")
          if [ "$total_old" -gt 0 ]; then
            change=$((100 * count / total_old))
          else
            change=0
          fi
          echo "CHANGE=$change" >> "$GITHUB_ENV"
      - name: 'Upload processed data as artifacts'
        uses: actions/upload-artifact@v4
        with:
          name: chebi_processed
          path: datasources/chebi/recentData/*
      - uses: JasonEtco/create-an-issue@v2
        if: ${{ env.COUNT != 0 }}
        name: Post issue about update availability
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: ChEBI
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
          update_existing: true
      - uses: JasonEtco/create-an-issue@v2
        name: Post issue about failing test
        # failure() lifts the implicit success() check so the issue is also
        # posted when an earlier step (e.g. the RegEx test) exited non-zero
        if: ${{ failure() || env.FAILED == 'true' }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: chebi
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
          update_existing: true

  # ---------------------------------------------------------------------------
  # Docker: send a repository_dispatch event to sec2pri/omicsFixID.
  # NOTE(review): this job declares no `needs:`, so it runs on every trigger of
  # the workflow regardless of the other jobs' outcome - confirm that is intended.
  # ---------------------------------------------------------------------------
  trigger-docker-update:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Trigger docker release
        uses: peter-evans/repository-dispatch@v3
        with:
          token: ${{ secrets.PING }}
          repository: sec2pri/omicsFixID
          event-type: update-event
          client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}", "datasource": "ChEBI"}'