-
Notifications
You must be signed in to change notification settings - Fork 3
212 lines (193 loc) · 8.46 KB
/
hgnc.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
name: Check and test HGNC updates

# Events that start this workflow.
on:
  # Timed run: 00:00 on the 1st and 15th day of each month.
  schedule:
    - cron: '0 0 1,15 * *'
  # Self-test: runs on pull requests that modify this workflow file.
  pull_request:
    paths:
      - ".github/workflows/hgnc.yml"
  # Manual run from the Actions tab.
  workflow_dispatch:
# Scopes granted to the GITHUB_TOKEN for all jobs in this workflow.
# NOTE(review): only `issues: write` is visibly needed by the steps in this
# file (issue creation); `pages` and `id-token` look unused here - confirm
# before tightening.
permissions:
  issues: write
  contents: write
  pages: write
  id-token: write
jobs:
  # Job 1: look up the newest quarterly HGNC release on the EBI FTP index and
  # expose its file names and date (plus the date stored in the repo config)
  # as job outputs.
  check_new_data:
    runs-on: ubuntu-latest
    name: Check the date of the latest data
    outputs:
      COMPLETE_NEW: ${{ steps.check_download.outputs.COMPLETE_NEW }}
      WITHDRAWN_NEW: ${{ steps.check_download.outputs.WITHDRAWN_NEW }}
      DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
      DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
    steps:
      # step 1: check the release date for the latest HGNC files
      - name: Checkout
        uses: actions/checkout@v4
      - name: Check for new hgnc files
        id: check_download
        run: |
          ## Extract the date from the hgnc config file
          date_old=$(grep -E '^date=' datasources/hgnc/config | cut -d'=' -f2)
          echo 'Accessing the hgnc data'
          wget https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/quarterly/tsv/ -O hgnc_index.html
          # Extract the latest complete file. ISO dates sort lexicographically,
          # so `sort -u | tail` picks the newest regardless of listing order.
          complete=$(grep -o 'hgnc_complete_set_[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}\.txt' hgnc_index.html | sort -u | tail -n 1)
          # Extract the latest withdrawn file
          withdrawn=$(grep -o 'withdrawn_[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}\.txt' hgnc_index.html | sort -u | tail -n 1)
          # Fail here with a clear message instead of letting the download
          # step of the next job break on empty file names.
          if [ -z "$complete" ] || [ -z "$withdrawn" ]; then
            echo "::error::Could not find the HGNC complete/withdrawn files in the archive index"
            exit 1
          fi
          # hgnc_complete_set_<date>.txt -> <date>
          date_new=$(echo "$complete" | awk -F '_' '{print $4}' | sed 's/\.txt//')
          # Store dates to output
          echo "DATE_OLD=$date_old" >> "$GITHUB_OUTPUT"
          echo "DATE_NEW=$date_new" >> "$GITHUB_OUTPUT"
          echo "COMPLETE_NEW=$complete" >> "$GITHUB_OUTPUT"
          echo "WITHDRAWN_NEW=$withdrawn" >> "$GITHUB_OUTPUT"
          echo "Date of latest release: $date_new", "Date of release of the current version: $date_old"

  # Job 2: download the release found by job 1, run the R preprocessing on it,
  # diff the resulting ID pairs against the existing data, and open an issue
  # about either the available update or the failed processing.
  test_new_data_processing:
    name: Processing new data and check updates
    needs: check_new_data
    # Outputs of job 1, available as environment variables in every step.
    env:
      DATE_OLD: ${{ needs.check_new_data.outputs.DATE_OLD }}
      DATE_NEW: ${{ needs.check_new_data.outputs.DATE_NEW }}
      COMPLETE_NEW: ${{ needs.check_new_data.outputs.COMPLETE_NEW }}
      WITHDRAWN_NEW: ${{ needs.check_new_data.outputs.WITHDRAWN_NEW }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      # step 2: download the recent data
      - name: Download the recent data
        run: |
          ## Store outputs from previous job in environment variables.
          ## The key must be a literal name: "$DATE_NEW=..." would expand the
          ## name itself to the date value.
          echo "DATE_NEW=$DATE_NEW" >> "$GITHUB_ENV"
          echo "COMPLETE_NEW=$COMPLETE_NEW" >> "$GITHUB_ENV"
          echo "WITHDRAWN_NEW=$WITHDRAWN_NEW" >> "$GITHUB_ENV"
          ## Create temp. folder to store the data in
          mkdir -p datasources/hgnc/data
          ## Download hgnc file
          wget "https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/quarterly/tsv/${WITHDRAWN_NEW}"
          wget "https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/quarterly/tsv/${COMPLETE_NEW}"
          mv "$WITHDRAWN_NEW" "$COMPLETE_NEW" datasources/hgnc/data
          ## Check file size if available
          ls -trlh datasources/hgnc/data
      # step 3: run the Rscripts for hgnc preprocessing
      - name: Install R
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: '4.1' # Specify the R version
          include-recommended: true
      - name: Test hgnc data processing
        id: hgnc_process
        run: |
          sourceVersion=$DATE_NEW
          complete="datasources/hgnc/data/${COMPLETE_NEW}"
          withdrawn="datasources/hgnc/data/${WITHDRAWN_NEW}"
          # Run the R script inside the `if` condition: the default shell is
          # `bash -e`, so a bare failing command would abort the step before
          # FAILED=true could be recorded for the issue step below.
          if Rscript r/src/hgnc.R "$sourceVersion" "$withdrawn" "$complete"; then
            # script succeeded
            echo "Successful preprocessing of hgnc data."
            echo "FAILED=false" >> "$GITHUB_ENV"
          else
            # script failed
            echo "Failed preprocessing of hgnc data."
            echo "FAILED=true" >> "$GITHUB_ENV"
          fi
      # step 4: compare the new and old data
      - name: Diff versions
        if: ${{ env.FAILED == 'false' }}
        run: |
          # Set up vars from config file
          to_check_from_zenodo=$(grep -E '^to_check_from_zenodo=' datasources/hgnc/config | cut -d'=' -f2)
          old="datasources/hgnc/data/$to_check_from_zenodo"
          # presumably produced by r/src/hgnc.R in the previous step - confirm
          new="datasources/hgnc/recentData/$to_check_from_zenodo"
          # Extract the primaryID and secondaryID column from both files and sort them
          sort "$old" | tr -d "\r" | cut -f 1,3 > ids_old.txt
          sort "$new" | tr -d "\r" | cut -f 1,3 > ids_new.txt
          # TODO decide whether to perform diff on symbols too?
          # Extract the secondarySymbol column from both files and sort them
          ## sort "$old" | tr -d "\r" | cut -f 4 > symbols_old.txt
          ## sort "$new" | tr -d "\r" | cut -f 4 > symbols_new.txt
          echo "Performing diff between the sorted lists of IDs"
          # Perform a diff between the sorted lists of IDs
          # (diff exits 1 when the files differ, hence `|| true`)
          output_file=diff.txt
          diff -u ids_old.txt ids_new.txt > "$output_file" || true
          head "$output_file"
          # The first two lines of a unified diff are the ---/+++ file headers,
          # not data: skip them. Strip only the leading marker so that '+' or
          # '-' characters inside the IDs survive.
          # retrieve new lines
          added=$(tail -n +3 "$output_file" | grep '^+' | sed 's/^+//') || true
          # retrieve removed lines
          removed=$(tail -n +3 "$output_file" | grep '^-' | sed 's/^-//') || true
          # Drop pairs that only moved (present on both sides of the diff)
          added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
          removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
          added=$added_filtered
          removed=$removed_filtered
          # Count them. printf '%s\n' terminates the last line so wc -l counts
          # it, and keeps the data out of the format string.
          if [ -z "$removed" ]; then
            count_removed=0
            removed="None"
          else
            count_removed=$(printf '%s\n' "$removed" | wc -l)
          fi
          if [ -z "$added" ]; then
            count_added=0
            added="None"
          else
            count_added=$(printf '%s\n' "$added" | wc -l)
          fi
          echo ________________________________________________
          echo " removed pairs "
          echo ________________________________________________
          echo "$removed"
          echo ________________________________________________
          echo " added pairs "
          echo ________________________________________________
          echo "$added"
          echo _________________________________________________
          echo "What's changed:"
          echo "- Added id pairs: $count_added"
          echo "- Removed id pairs: $count_removed"
          # Store to env to use in issue
          echo "ADDED=$count_added" >> "$GITHUB_ENV"
          echo "REMOVED=$count_removed" >> "$GITHUB_ENV"
          count=$((count_added + count_removed))
          echo "COUNT=$count" >> "$GITHUB_ENV"
          # Percentage of changed pairs relative to the size of the old file
          total_old=$(wc -l < "$old") || total_old=0
          if [ "$total_old" -gt 0 ]; then
            change=$((100 * count / total_old))
          else
            # No baseline to compare against: avoid a division by zero
            echo "::warning::$old is empty or missing; relative change cannot be computed"
            change=0
          fi
          echo "CHANGE=$change" >> "$GITHUB_ENV"
      # upload-artifact v3 has been retired by GitHub; v4 is required
      - name: 'Upload processed data as artifacts'
        uses: actions/upload-artifact@v4
        with:
          name: hgnc_processed
          path: datasources/hgnc/recentData/*
      # COUNT is unset when the diff step was skipped; the expression coerces
      # the empty string to 0, so no issue is posted in that case.
      - uses: JasonEtco/create-an-issue@v2
        if: env.COUNT != 0
        name: Post issue about update availability
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: "HGNC"
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
          update_existing: true
      - uses: JasonEtco/create-an-issue@v2
        name: Post issue about failing test
        if: ${{ env.FAILED == 'true' }}
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: "HGNC"
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md
          update_existing: true

  # Docker
  # Job 3: send a repository_dispatch event ("update-event") to
  # sec2pri/omicsFixID.
  # NOTE(review): this job has no `needs:`, so it runs on every trigger
  # independently of the outcome of the jobs above - confirm this is intended.
  trigger-docker-update:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Trigger docker release
        uses: peter-evans/repository-dispatch@v3
        with:
          token: ${{ secrets.PING }}
          repository: sec2pri/omicsFixID
          event-type: update-event
          client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}", "datasource": "HGNC"}'