-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
178 lines (145 loc) · 7.14 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
from itertools import count
import requests
from airtable import airtable
# https://pypi.org/project/airtable/
import json
from dateutil.parser import parse
import os
# --- Configuration ----------------------------------------------------------
# Airtable credentials are read from the environment; the destination table
# name is fixed.
AIRTABLE_API_KEY = os.environ.get('AIRTABLE_API_KEY')
AIRTABLE_BASE_ID = os.environ.get('AIRTABLE_BASE_ID')
AIRTABLE_TABLE_NAME = 'Funding Opportunities'

# Search terms submitted to the Grants.gov opportunity search.
GRANTS_GOV_KEYWORDS = ['Algeria', 'MENA', 'Middle East and Northern Africa']

# Client object used for the Airtable calls made by this script.
airtable_api = airtable.Airtable(AIRTABLE_BASE_ID, AIRTABLE_API_KEY)
# START POPULATING LIST
# ---------------------------------------------------------------------------
# Grants.gov: page through the opportunity search and collect every raw hit
# into master_hits. master_list (filled further down) holds the rows that are
# eventually written to Airtable.
# ---------------------------------------------------------------------------

# Each page has 25 results, starting with index 0: results 0-24 are on the
# first page, 25-49 on the second, and so on.
GRANTS_GOV_PAGE_SIZE = 25
# Without an explicit timeout, requests waits indefinitely on a stalled
# connection.
GRANTS_GOV_TIMEOUT_SECONDS = 30

keyword_phrase = '%2C%20'.join(GRANTS_GOV_KEYWORDS)

# DON'T CHANGE THE FOLLOWING LINES:
post_url = 'https://www.grants.gov/grantsws/rest/opportunities/search/'
headers = {
    'Content-Type': 'application/json; charset=utf-8',
}

# master list that'll hold all of the results
master_hits = []
master_list = []

# Taken from the first response only; tells us when every hit is collected.
total_hits = None

for current_page in count(1):
    start_record_num = (current_page - 1) * GRANTS_GOV_PAGE_SIZE
    # json.dumps with compact separators yields the same body the search
    # endpoint was already receiving, while escaping the keyword text safely.
    post_data = json.dumps(
        {
            "startRecordNum": str(start_record_num),
            "keyword": keyword_phrase,
            "oppNum": "",
            "cfda": "",
            "oppStatuses": "forecasted|posted",
        },
        separators=(',', ':'),
    )
    response = requests.post(
        post_url,
        headers=headers,
        data=post_data,
        timeout=GRANTS_GOV_TIMEOUT_SECONDS,
    )
    # Fail loudly on an HTTP error instead of trying to parse an error page
    # as search results.
    response.raise_for_status()
    json_response = json.loads(response.text)

    if total_hits is None:
        total_hits = int(json_response['hitCount'])

    page_hits = json_response['oppHits']
    master_hits.extend(page_hits)

    # Stop once everything is collected. An empty page also ends the loop:
    # if the service returns fewer hits than it advertised, continuing would
    # never reach total_hits and the loop would spin forever.
    if not page_hits or len(master_hits) >= total_hits:
        break
# At this point master_hits holds every hit the search returned.
# Turn each raw Grants.gov hit into a row and queue it in master_list.
for hit in master_hits:
    # Some open/close dates are empty strings. Airtable can't parse an empty
    # string as a date, so those are sent as None; otherwise the value is
    # passed on with its spaces removed.
    raw_open = hit['openDate']
    posted = raw_open.replace(' ', '') if raw_open != '' else None

    raw_close = hit['closeDate']
    closed = raw_close.replace(' ', '') if raw_close != '' else None

    # add the Grants.gov row to the master_list
    master_list.append({
        'Title': hit['title'],
        'Posted Date': posted,
        'Closed Date': closed,
        'Link': f"https://www.grants.gov/web/grants/view-opportunity.html?oppId={hit['id']}",
        'Source': 'Grants.gov',
    })
#**************************************************************************************************
# European Commission scraper: its results are added to master_list as well.
EUROPEAN_COMMISION_KEYWORDS = [
    'Algeria',
    'MENA',
]
# Without an explicit timeout, requests waits indefinitely on a stalled
# connection.
EC_REQUEST_TIMEOUT_SECONDS = 30

keyword_phrase = ', '.join(EUROPEAN_COMMISION_KEYWORDS)

# Sent as URL query parameters (via `params=`), not as the request body.
# Only page 1 is requested, with a page size of 1000.
post_data = {
    "apiKey": "SEDIA",
    "text": keyword_phrase+"*",
    "pageSize": "1000",
    "pageNumber": "1",
}

# DON'T CHANGE THE FOLLOWING LINES:
post_url = "https://api.tech.ec.europa.eu/search-api/prod/rest/search"
# NOTE(review): this header declares JSON while the body below is a
# hand-built multipart/form-data payload. Left untouched because the request
# is marked "don't change" -- confirm the server really ignores the mismatch.
headers = {
    'Content-Type': 'application/json; charset=utf-8',
}

response = requests.post(
    post_url,
    headers=headers,
    params=post_data,
    data='-----------------------------280098105110154896432867507591\r\nContent-Disposition: form-data; name="query"; filename="blob"\r\nContent-Type: application/json\r\n\r\n{"bool":{"must":[{"terms":{"type":["0","1","2","8"]}},{"terms":{"status":["31094501","31094502","31094503"]}}]}}\r\n-----------------------------280098105110154896432867507591\r\nContent-Disposition: form-data; name="languages"; filename="blob"\r\nContent-Type: application/json\r\n\r\n["en"]\r\n-----------------------------280098105110154896432867507591\r\nContent-Disposition: form-data; name="sort"; filename="blob"\r\nContent-Type: application/json\r\n\r\n{"field":"sortStatus","order":"ASC"}\r\n-----------------------------280098105110154896432867507591--\r\n',
    timeout=EC_REQUEST_TIMEOUT_SECONDS,
)
# Fail loudly on an HTTP error instead of trying to parse an error page as
# search results.
response.raise_for_status()
json_response = json.loads(response.text)

# De-duplicate by title ('content'): keep the first result seen for each
# non-empty title, preserving the order in which they were returned.
distinct_results = {}
for result in json_response['results']:
    title = result['content']
    if title != '' and title not in distinct_results:
        distinct_results[title] = result

# master list that'll hold all of the (de-duplicated) results
master_hits = list(distinct_results.values())
# Turn each European Commission result into a row and queue it in master_list.
for record in master_hits:
    metadata = record['metadata']

    # The date fields are indexed as lists below. A missing key, an empty
    # list or an empty string all mean "no date": Airtable can't parse an
    # empty string as a date, so None is sent instead. Both fields are
    # guarded the same way so one incomplete result cannot abort the run.
    start_dates = metadata.get('startDate') or []
    open_date = None
    if start_dates:
        # the first listed start date is used as the posted date
        open_date = parse(start_dates[0]).strftime('%Y-%m-%d')

    deadline_dates = metadata.get('deadlineDate') or []
    close_date = None
    if deadline_dates:
        # the last listed deadline is used as the closing date
        close_date = parse(deadline_dates[-1]).strftime('%Y-%m-%d')

    # data: assemble! :mjolnir:
    # NOTE(review): 'Eurpean Commission' looks like a misspelling. It is left
    # unchanged because the value is written to Airtable as-is and existing
    # options/views may match on it -- confirm before correcting.
    data = {
        'Title': record['content'],
        'Posted Date': open_date,
        'Closed Date': close_date,
        'Link': f"https://ec.europa.eu/info/funding-tenders/opportunities/portal/screen/opportunities/topic/{metadata['identifier'][0].lower()}",
        'Source': 'Eurpean Commission'
    }
    # add data collected from the European script to the master_list
    master_list.append(data)
# ----------------------------
# BEGIN UPDATING AIRTABLE
# Clear the entire table, then re-create it from master_list.
#
# A table record looks like: OrderedDict([('id', 'recmZ3JFwTsUcJWSo'), ('createdTime', '2022-06-14T12:47:39.000Z'), ('fields', OrderedDict([('Posted Date', '2022-06-01'), ('Title', 'Title 2'), ('Close Date', '2022-06-23'), ('Link', 'https://www.google.com')]))])
# NOTE(review): the sample above shows a 'Close Date' field while the rows
# built by this script use the key 'Closed Date' -- confirm the column name.
#
# A single get() returns only one page of records, so follow the 'offset'
# cursor until it disappears. The ids are collected first and deleted
# afterwards so the deletes cannot disturb the listing while it is paged.
stale_record_ids = []
offset = None
while True:
    page = airtable_api.get(AIRTABLE_TABLE_NAME, offset=offset)
    stale_record_ids.extend(table_record['id'] for table_record in page['records'])
    offset = page.get('offset')
    if not offset:
        break

# just delete them all; anything still relevant is re-added from the fetch above
for record_id in stale_record_ids:
    airtable_api.delete(AIRTABLE_TABLE_NAME, record_id)

# Populate the combined data from both sources into the single Airtable table
for record in master_list:
    airtable_api.create(AIRTABLE_TABLE_NAME, record)