-
Notifications
You must be signed in to change notification settings - Fork 3
192 lines (172 loc) · 7.58 KB
/
uniprot.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# Workflow that looks up the latest UniProt release, runs the repository's
# UniProt preprocessing script against it and reports the outcome as an issue.
name: Check and test UniProt updates

on:
  # Allow manual runs.
  workflow_dispatch:
  # Run on pull requests that touch this workflow file, to test that it works.
  pull_request:
    paths:
      - '.github/workflows/uniprot.yml'
  schedule:
    # Run the workflow on the 1st and 15th day of each month (00:00 UTC).
    - cron: '0 0 1,15 * *'

# Least privilege: the jobs only check out the repository and open issues.
# Nothing in this workflow pushes commits, deploys Pages or requests an OIDC
# token, so no write access to contents/pages/id-token is granted.
permissions:
  contents: read
  issues: write

jobs:
  # Job 1: read the release date currently recorded in the repository config
  # and the date of the latest release listed on the UniProt download index.
  check_new_data:
    name: Check the date of the latest data
    runs-on: ubuntu-latest
    outputs:
      UNIPROT_SPROT_NEW: ${{ steps.check_download.outputs.UNIPROT_SPROT_NEW }}
      SEC_AC_NEW: ${{ steps.check_download.outputs.SEC_AC_NEW }}
      DELAC_SP_NEW: ${{ steps.check_download.outputs.DELAC_SP_NEW }}
      DATE_NEW: ${{ steps.check_download.outputs.DATE_NEW }}
      DATE_OLD: ${{ steps.check_download.outputs.DATE_OLD }}
    steps:
      # step 1: check the release date for the latest UniProt files
      - name: Checkout
        uses: actions/checkout@v4
      - name: Check for new uniprot files
        id: check_download
        run: |
          # Extract the date from the uniprot config file
          date_old=$(grep -E '^date=' datasources/uniprot/config | cut -d'=' -f2)
          echo 'Accessing the uniprot data'
          wget https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/ -O uniprot_index.html
          cat uniprot_index.html
          # The date is read from the table cell that follows the file link;
          # \K discards the matched prefix so only YYYY-MM-DD is printed.
          date_new=$(grep -oP 'uniprot_sprot\.fasta\.gz</a></td><td[^>]*>\K[0-9]{4}-[0-9]{2}-[0-9]{2}' uniprot_index.html | head -n 1)
          # Fail with an explicit message instead of passing an empty version on.
          if [ -z "$date_new" ]; then
            echo "::error::Could not read the release date of uniprot_sprot.fasta.gz from the index page"
            exit 1
          fi
          # Store the dates and the names of the files to download as outputs,
          # so that every output declared by this job is actually set.
          {
            echo "DATE_OLD=$date_old"
            echo "DATE_NEW=$date_new"
            echo "UNIPROT_SPROT_NEW=uniprot_sprot.fasta.gz"
            echo "SEC_AC_NEW=sec_ac.txt"
            echo "DELAC_SP_NEW=delac_sp.txt"
          } >> "$GITHUB_OUTPUT"
          echo "Date of latest release: $date_new", "Date of release of the current version: $date_old"

  # Job 2: download the latest files, run the preprocessing script, diff the
  # result against the previous data and post an issue with the outcome.
  test_new_data_processing:
    name: Processing new data and check updates
    needs: check_new_data
    runs-on: ubuntu-latest
    # Job-level env so that every step sees the same values. Shell variables
    # set inside one `run` step are not visible in later steps.
    env:
      DATE_OLD: ${{ needs.check_new_data.outputs.DATE_OLD }}
      DATE_NEW: ${{ needs.check_new_data.outputs.DATE_NEW }}
      UNIPROT_SPROT_NEW: ${{ needs.check_new_data.outputs.UNIPROT_SPROT_NEW }}
      SEC_AC_NEW: ${{ needs.check_new_data.outputs.SEC_AC_NEW }}
      DELAC_SP_NEW: ${{ needs.check_new_data.outputs.DELAC_SP_NEW }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # step 2: download the recent data
      - name: Download the recent data
        run: |
          base_url=https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete
          # Create temp. folder to store the data in
          mkdir -p datasources/uniprot/data
          # Download uniprot files
          wget "${base_url}/${UNIPROT_SPROT_NEW}"
          wget "${base_url}/docs/${SEC_AC_NEW}"
          wget "${base_url}/docs/${DELAC_SP_NEW}"
          mv "$DELAC_SP_NEW" "$SEC_AC_NEW" "$UNIPROT_SPROT_NEW" datasources/uniprot/data
          # Check file size if available
          ls -trlh datasources/uniprot/data

      # step 3: run the Rscripts for uniprot preprocessing
      - name: Install R
        uses: r-lib/actions/setup-r@v2
        with:
          r-version: '4.1'  # Specify the R version
          include-recommended: true
      - name: Test uniprot data processing
        id: uniprot_process
        run: |
          sourceVersion=$DATE_NEW
          uniprot_sprot="datasources/uniprot/data/${UNIPROT_SPROT_NEW}"
          sec_ac="datasources/uniprot/data/${SEC_AC_NEW}"
          delac_sp="datasources/uniprot/data/${DELAC_SP_NEW}"
          echo Check files
          echo "$sourceVersion"
          echo "$uniprot_sprot"
          echo "$delac_sp"
          echo "$sec_ac"
          ls -trlh datasources/uniprot/data
          # Run the R script inside the `if` condition: the default shell runs
          # with `-e`, so a bare failing command would abort the step before
          # FAILED=true could be recorded.
          if Rscript r/src/uniprot.R "$sourceVersion" "$uniprot_sprot" "$delac_sp" "$sec_ac"; then
            # script succeeded
            echo "Successful preprocessing of uniprot data."
            echo "FAILED=false" >> "$GITHUB_ENV"
          else
            # script failed
            echo "Failed preprocessing of uniprot data."
            echo "FAILED=true" >> "$GITHUB_ENV"
          fi

      # step 4: compare the new and old data
      - name: Diff versions
        if: ${{ env.FAILED == 'false' }}
        run: |
          # Set up vars from config file
          to_check_from_zenodo=$(grep -E '^to_check_from_zenodo=' datasources/uniprot/config | cut -d'=' -f2)
          old="datasources/uniprot/data/$to_check_from_zenodo"
          new="datasources/uniprot/recentData/$to_check_from_zenodo"
          column_name="secondaryID"
          echo "$column_name"
          ls datasources/uniprot/data/
          ls datasources/uniprot/recentData/
          head "$old"
          head "$new"
          # Extract columns 1 and 3 (primaryID and secondaryID) from both files.
          # Sort after cutting so the compared lists are really sorted.
          tr -d '\r' < "$old" | cut -f 1,3 | sort > ids_old.txt
          tr -d '\r' < "$new" | cut -f 1,3 | sort > ids_new.txt
          # TODO decide whether to perform diff on symbols too?
          # Extract the secondarySymbol column from both files and sort them
          ## tr -d '\r' < "$old" | cut -f 4 | sort > symbols_old.txt
          ## tr -d '\r' < "$new" | cut -f 4 | sort > symbols_new.txt
          echo "Performing diff between the sorted lists of IDs"
          # diff exits with 1 when the files differ, which is not an error here
          output_file=diff.txt
          diff -u ids_old.txt ids_new.txt > "$output_file" || true
          head "$output_file"
          # The first two lines of a unified diff are the '---'/'+++' file
          # headers; skip them so they are not counted as changes. Only the
          # leading marker is stripped, so '+'/'-' inside the data are kept.
          echo Counting added lines...
          added=$(tail -n +3 "$output_file" | grep '^+' | sed 's/^+//') || true
          echo Counting removed lines...
          removed=$(tail -n +3 "$output_file" | grep '^-' | sed 's/^-//') || true
          # Drop lines that show up as both added and removed
          added_filtered=$(comm -23 <(sort <<< "$added") <(sort <<< "$removed"))
          removed_filtered=$(comm -23 <(sort <<< "$removed") <(sort <<< "$added"))
          added=$added_filtered
          removed=$removed_filtered
          # Count non-empty lines. grep -c prints 0 (and exits 1) when there
          # are none, so an empty list is counted as 0.
          echo Counting differences:
          count_removed=$(grep -c . <<< "$removed" || true)
          count_added=$(grep -c . <<< "$added" || true)
          echo removed:
          echo "$removed"
          echo added:
          echo "$added"
          echo "Added id pairs: $count_added"
          echo "Removed id pairs: $count_removed"
          # Store to env to use in issue
          {
            echo "ADDED=$count_added"
            echo "REMOVED=$count_removed"
            echo "URL_RELEASE=https://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/knowledgebase/complete/${UNIPROT_SPROT_NEW}"
          } >> "$GITHUB_ENV"

      # step 5: post issues
      # Only report an available update when preprocessing succeeded; otherwise
      # ADDED/REMOVED/URL_RELEASE were never set by the diff step.
      # NOTE(review): DATE_NEW is never compared with DATE_OLD, so this issue
      # is posted even when the release has not changed - confirm intent.
      - name: Post issue about update availability
        if: ${{ env.FAILED == 'false' }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: "UniProt"
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_UPDATE.md
      - name: Post issue about failing test
        if: ${{ env.FAILED == 'true' }}
        uses: JasonEtco/create-an-issue@v2
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SOURCE: "UniProt"
        with:
          filename: .github/ISSUE_TEMPLATE/ISSUE_FAIL.md