-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhull_scraper.py
377 lines (318 loc) · 13 KB
/
hull_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
import requests as req
import BeautifulSoup as bs
from urlparse import urljoin, urlsplit
from collections import deque
import datetime
import re
from py2neo import Graph, Relationship, Node, NodeSelector
import dateparser as dp
import time
import settings_module as settings
# Base URL of the site being scraped, taken from the local settings module.
root = settings.ROOT
# Path (relative to ``root``) of the master index page read by get_regent_list().
master_list = "/php/cssbct/cgi-bin/gedidex.php/n=royal?denmark"

# A single shared HTTP session, so cookies set by the site persist between
# requests. The adapter mounted for plain-http URLs is configured with
# max_retries=3.
session = req.Session()
a = req.adapters.HTTPAdapter(max_retries=3)
session.mount('http://', a)

# Neo4j connection used by the persistence code in this module.
# (Fixed: the original read ``password =s ettings.NEO_PASSWORD``, which is a
# syntax error.)
client = Graph(password=settings.NEO_PASSWORD)
def get_regent_list():
    """Fetch the master index page and return the links it lists.

    The page is requested twice: the cookies from the first response are
    sent back explicitly with the second request, and only the second
    response is parsed.

    Returns:
        list -- the ``href`` of the first ``<a>`` inside every ``<dt>`` of
        the page. Entries without a link (or whose link has no ``href``)
        are skipped.

    Raises:
        Exception -- if either request answers with a non-200 status.
    """
    url = root + master_list

    # The first request only serves to obtain the cookie.
    resp = session.get(url)
    if resp.status_code != 200:
        raise Exception('Response of {} failed with status {} \n\n Content: {}'.format(master_list, resp.status_code, resp.text))

    # Send back the cookie
    listing = session.get(url, cookies=resp.cookies)
    if listing.status_code != 200:
        raise Exception('Cookie authenticated response of {} failed with status {} \n\n Content: {}'.format(master_list, listing.status_code, listing.text))

    soup = bs.BeautifulSoup(listing.content)

    # Turn the html into a list of links
    hrefs = []
    for entry in soup.findAll('dt'):
        anchor = entry.find('a')
        if anchor is None:
            # A <dt> without a link used to abort the whole listing with an
            # AttributeError; there is nothing to follow, so skip it.
            continue
        href = anchor.get('href')
        if href is not None:
            hrefs.append(href)
    return hrefs
def get_unfinished_person():
    """Return one ``Person`` node from the graph that has no ``_cat`` property.

    The Cypher query is limited to a single row; the value bound to ``a`` in
    that row is returned.
    NOTE(review): what ``cursor.next()`` does when no row matches depends on
    the py2neo version in use -- confirm before relying on it.
    """
    cursor = client.run("match (a:Person) where not exists(a._cat) return a limit 1")
    record = cursor.next()
    return record['a']
def to_path(url):
    """Reduce a url to ``<path>?<query>``, dropping scheme, host and fragment.

    The ``?`` separator is always present, even when the url has no query.

    Raises:
        Exception -- when the url has neither a path nor a query.
    """
    _scheme, _netloc, url_path, url_query, _fragment = urlsplit(url)
    combined = ''.join([url_path, '?', url_query])
    if combined == '?':
        raise Exception('Empty path found')
    return combined
# Optional day, optional upper-case month abbreviation, then a 3-4 digit year.
_DATE_PATTERN = re.compile(r'(?P<date>\d+)*?\s*(?P<month>JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)?\s*(?P<year>\d{3,4})')


def parse_date(date_string):
    """Parse a date string of the format 15 JUN 1902.

    Day and month are optional; whichever is missing is replaced by "01".

    Arguments:
        date_string -- text containing the date

    Returns:
        (year, iso_date) -- the year as an int and the date as a
        ``YYYY-MM-DD`` string.

    Raises:
        AttributeError -- when no date is found in the text or dateparser
            cannot interpret it. The original code raised this type
            implicitly (attribute access on ``None``), and
            ``Person.__get_marriages`` catches it, so the type is kept.
        TypeError -- when ``date_string`` is not a string.
    """
    match = _DATE_PATTERN.search(date_string)
    if match is None:
        raise AttributeError('No date found in {!r}'.format(date_string))
    parts = match.groupdict()

    # Pad years < 1000 as dateparser otherwise cannot parse them
    year = parts['year'].zfill(4)
    day = parts['date'] or "01"
    month = parts['month'] or "01"

    # Join the fields in an explicit day-month-year order. The original
    # joined ``dict.values()``, whose order is arbitrary on Python 2, so the
    # text handed to dateparser could come out in any permutation.
    parsed = dp.parse(" ".join([day, month, year]))
    if parsed is None:
        raise AttributeError('Could not parse date {!r}'.format(date_string))
    return parsed.year, str(parsed.date())
class Loggable(object):
    """Append timestamped messages, tagged with an identifier, to ``logs.txt``."""

    def __init__(self, id):
        # ``id`` shadows the builtin, but the parameter name is part of the
        # constructor's interface and is therefore left unchanged.
        self.id = id

    def log(self, message):
        """Append one line ``<ISO timestamp> <id>: <message>`` to ``logs.txt``.

        The file is opened in append mode in the current working directory
        and closed again on every call.
        """
        stamp = datetime.datetime.now().isoformat()
        with open('logs.txt', 'a') as fl:
            # A space now separates timestamp and id; the original wrote them
            # back to back, which made the log lines hard to read.
            fl.write("{} {}: {}\n".format(stamp, self.id, message))
class Persistable():
    """Neo4j persistence helper bound to one owner object.

    The owner (``caller``) has to provide a ``to_neo()`` method returning a
    py2neo entity and a ``props`` dict with the properties to store on it.
    """

    def __init__(self, caller):
        self.caller = caller

    def persist(self):
        """
        Merge the owner's entity into neo4j, then push its properties
        """
        owner = self.caller
        entity = owner.to_neo()
        client.merge(entity)
        entity.update(owner.props)
        client.push(entity)

    def is_persisted(self):
        """
        Tell whether neo4j already holds a node carrying the entity's first
        label and all of its properties (those from ``to_neo()`` plus the
        owner's ``props``)
        TODO: Make code less hateable
        """
        owner = self.caller
        entity = owner.to_neo()
        first_label = list(entity.labels())[0]
        entity.update(owner.props)
        found = NodeSelector(client).select(first_label, **dict(entity))
        return len(list(found)) > 0
class Person(object):
    """One person of the Hull genealogy database, identified by url path.

    A freshly constructed instance only knows its ``path``. ``fetch()``
    downloads the person's page and fills ``props``, ``children``,
    ``parents``, ``marriages`` and ``spouses``. ``persist()`` and
    ``is_persisted()`` are bound from a ``Persistable`` helper and ``log()``
    from a ``Loggable`` helper.
    """

    def __init__(self, url):
        self.root = "http://www.hull.ac.uk"
        self.path = to_path(url)
        self.log = Loggable(self.path).log
        # One helper instance is enough to provide both bound methods.
        persistable = Persistable(self)
        self.persist = persistable.persist
        self.is_persisted = persistable.is_persisted
        self.props = {'_cat': True}
        # Filled in by fetch(); start empty so the attributes always exist.
        self.children = []
        self.parents = {}
        self.marriages = []
        self.spouses = []

    def __eq__(self, other):
        """Persons are equal when their paths are; a plain path compares too."""
        if isinstance(other, Person):
            return self.path == other.path
        return self.path == other

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        return not self.__eq__(other)

    def __hash__(self):
        # Kept consistent with __eq__: equal persons (and their path string)
        # hash alike.
        return hash(self.path)

    def fetch(self):
        """
        Populate person from the Hull database
        """
        soup = self.__get_person_doc()
        # We do not know what properties will exist for a given character, so set it as a dict
        self.props.update(self.__get_person_properties(soup))
        self.children = self.__get_children(soup)
        self.parents = self.__get_parents(soup)
        self.marriages = self.__get_marriages(soup)
        self.spouses = [spouse for spouse, _ in self.marriages]

    def to_neo(self):
        """
        Return a neo4j representation of this node
        """
        return Node("Person", path=self.path)

    def __get_person_doc(self):
        """Get a BeautifulSoup representation of the document for this person.

        Raises:
            requests.exceptions.ConnectionError -- on any non-200 response
        """
        headers = {'user-agent': 'royalty-bot/0.0.1'}
        resp = session.get(urljoin(self.root, self.path), timeout=5, headers=headers)
        if resp.status_code != 200:
            raise req.exceptions.ConnectionError('Response of {} failed with status {} \n\n Content: {}'.format(self.path, resp.status_code, resp.text))
        return bs.BeautifulSoup(resp.content)

    def __get_person_properties(self, soup):
        """Get a dict of all the basic properties of an entry.

        Reads name, family and title from the ``<h2>`` heading and the
        'Born', 'Acceded' and 'Died' dates. Failures are logged and the
        affected property is left out.

        return:
            props -- dict of properties, none guaranteed to be there
        """
        try:
            name_string = soup.find('h2').text
            props = self.__parse_title_string(name_string)
            # Forward an optional log message from the parser and drop it
            # from the stored properties.
            log_message = props.pop('log_message', None)
            if log_message is not None:
                self.log(log_message)
        except Exception:
            self.log("error parsing h2 tag.")
            props = {}

        # Get the date based properties
        for prop in ('Born', 'Acceded', 'Died'):
            tag = soup.find(text=prop)
            if not tag:
                continue
            try:
                year, date_string = parse_date(tag.parent.nextSibling)
            except Exception:
                self.log('Error in recovering {} property'.format(prop))
                continue
            props[prop + '_year'] = year
            props[prop] = date_string
        return props

    def __get_marriages(self, soup):
        """
        Return the marriages for the person as a list of tuples of
        (path, {'date': ..., 'year': ...}). The path is None when the link
        to the spouse could not be parsed; date and year are None when no
        date could be parsed.
        """
        def parse_marriage(marriage):
            """
            Find the path of the spouse and possibly the date of marriages
            arguments:
            marriage -- BeautifulSoup navigable string
            """
            try:
                to = to_path(marriage.findNext('a').get('href'))
            except AttributeError:
                self.log('Error parsing marriage: {}'.format(str(marriage)))
                to = None

            # Parse date using the dateparse module
            year = None
            date_string = None
            try:
                year, date_string = parse_date(marriage.next)
            except AttributeError:
                # No usable date next to this marriage entry.
                pass
            except TypeError:
                self.log("Date found while parsing marriage, but dateparse could not parse it")
            return (to, {'date': date_string, 'year': year})

        return [parse_marriage(marriage) for marriage in soup.findAll(text='Married')]

    def __get_children(self, soup):
        """Return the paths of children contained in a document about a person.

        Each 'Child' entry is handled on its own: a malformed entry is
        logged and skipped rather than discarding every child of the page.
        """
        children = []
        for el in soup.findAll(text='Child'):
            # Extract the href of the child from the DOM
            try:
                href = el.parent.nextSibling.nextSibling.get('href')
            except AttributeError:
                href = None
            if href is None:
                self.log('Error parsing child entry')
                continue
            # Ensure that only the path of the url is returned
            children.append(to_path(href))
        return children

    def __parse_title_string(self, s):
        """
        Returns family, name and title based on the input.
        Arguments:
        s -- string formatted as FAMILY, NAME, TITLE as per Hull DB standard
        returns:
        res -- dict with the keys 'Family', 'Name' and 'Title' as far as
               they are present in s
        """
        res = {}
        arr = s.split(',')
        if len(arr) >= 3:
            # Extraneous information is sometimes added to the end of the
            # string, so only the first three fields are used
            res['Family'] = arr[0].strip()
            res['Name'] = arr[1].strip()
            res['Title'] = arr[2].strip()
        elif len(arr) == 2:
            # We're assuming title might be left out here
            res['Family'] = arr[0].strip()
            res['Name'] = arr[1].strip()
        else:
            # Most probably the name will be in string
            res['Name'] = arr[0].strip()
        return res

    def __get_parents(self, soup):
        """
        Return a dict with the paths of the parents under the keys 'Father'
        and 'Mother', as far as they could be found
        """
        # Father and mother tags are not navigable, so we'll need to find them in the list of paragraphs
        # NOTE(review): this is a substring test, so a paragraph mentioning
        # e.g. "grandfather" matches as well.
        parents = {}
        for p in soup.findAll('p'):
            text = p.text.lower()
            for keyword, role in (('father', 'Father'), ('mother', 'Mother')):
                if keyword not in text:
                    continue
                link = p.find('a')
                href = link.get('href') if link is not None else None
                if href is None:
                    # Previously an AttributeError that aborted the fetch.
                    self.log('No link found for {}'.format(role))
                    continue
                parents[role] = to_path(href)
        return parents

    def get_persisted(self):
        """
        Return persisted version: the first row of the lookup by path, or
        None when no node with this path exists
        """
        # The path is passed as a query parameter. The original query kept a
        # literal '{}' that was never filled in, so it could not match.
        # NOTE(review): ``{path}`` is the parameter syntax of the Neo4j 3.x
        # line; newer servers expect ``$path`` -- confirm the server version.
        query = "MATCH (a) WHERE a.path = {path} RETURN a LIMIT 1"
        match = client.data(query, path=self.path)
        if not match:
            return None
        return match[0]
class Relation(object):
    """Base class for a relationship between a source and a target.

    ``source`` and ``target`` may each be a ``Person`` (its path is stored)
    or any other value, which is stored unchanged. Subclasses supply
    ``to_neo()``, which the bound ``persist`` / ``is_persisted`` helpers
    rely on.
    """

    def __init__(self, source, target, props=None):
        """
        Arguments:
        source -- Person or path the relationship starts at
        target -- Person, path or other value the relationship points to
        props -- optional dict of relationship properties
        """
        self.source = source.path if isinstance(source, Person) else source
        self.target = target.path if isinstance(target, Person) else target
        # A fresh dict per instance; the former ``props={}`` default was one
        # dict shared by every Relation created without explicit props.
        self.props = {} if props is None else props
        persistable = Persistable(self)
        self.persist = persistable.persist
        self.is_persisted = persistable.is_persisted
class MarriedTo(Relation):
    """``married_to`` relationship between two Person nodes."""

    def to_neo(self):
        """Build the py2neo relationship, carrying this relation's props."""
        start = Node('Person', path=self.source)
        end = Node('Person', path=self.target)
        return Relationship(start, 'married_to', end, **self.props)
class ChildOf(Relation):
    """``child_of`` relationship from the source Person to the target Person."""

    def to_neo(self):
        """Build the py2neo relationship; the relation's props are not attached."""
        start = Node('Person', path=self.source)
        end = Node('Person', path=self.target)
        return Relationship(start, 'child_of', end)
class ParentOf(Relation):
    """``parent_of`` relationship from the source Person to the target Person."""

    def to_neo(self):
        """Build the py2neo relationship; the relation's props are not attached."""
        start = Node('Person', path=self.source)
        end = Node('Person', path=self.target)
        return Relationship(start, 'parent_of', end)
class HoldsTitle(Relation):
    """``holds_title`` relationship from a Person node to a Title node."""

    def to_neo(self):
        """Build the py2neo relationship; the target is used as the Title's name."""
        holder = Node('Person', path=self.source)
        title = Node('Title', name=self.target)
        return Relationship(holder, 'holds_title', title, **self.props)
def _persist_relations(node, stack):
    """Persist all relationships of a freshly fetched node and queue its relatives on ``stack``."""
    for spouse, props in node.marriages:
        if spouse is None:
            # The marriage parser yields None when the link to the spouse
            # could not be read; there is nothing to relate to or to visit.
            continue
        MarriedTo(node, spouse, props).persist()
        stack.append(spouse)
    for child in node.children:
        ParentOf(node, child).persist()
        stack.append(child)
    for parent in node.parents.values():
        ChildOf(node, parent).persist()
        stack.append(parent)
    if 'Title' in node.props:
        props = {}
        if 'Acceded' in node.props:
            props['acceded'] = node.props['Acceded']
            props['acceded_year'] = node.props['Acceded_year']
        HoldsTitle(node, node.props['Title'], props).persist()


def graph_search(start_node_path, limit=float('Inf')):
    """
    Exhaustively search the genealogical tree in a way that lets us record
    the type of relationship between nodes

    Starting at ``start_node_path``, every person not yet persisted is
    fetched and persisted together with its marriages, children, parents
    and title; the relatives are pushed on a stack and visited in turn.
    Fetches are spaced at least ``get_frequency`` seconds apart.

    Arguments:
    start_node_path -- url or path of the person to start from
    limit -- the loop keeps running while the number of visited stack
             entries is <= limit

    Returns:
    (stack, count) -- the paths still unvisited and the number of stack
    entries visited. Returns None when fetching the start node fails with a
    connection error.

    Raises:
    Any other exception is logged against the node being processed (when
    there is one) and re-raised.
    """
    stack = []
    count = 0
    get_frequency = 5
    node = None
    time.sleep(get_frequency)
    # Time of the last fetch, used for throttling. Set up front so it is
    # defined on every path.
    t1 = time.time()
    try:
        node = Person(start_node_path)
        if not node.is_persisted():
            try:
                node.fetch()
            except (req.exceptions.ConnectionError, req.exceptions.ConnectTimeout) as e:
                node.log(str(e))
                return
            node.persist()
            t1 = time.time()
            _persist_relations(node, stack)

        while stack and count <= limit:
            if count % 20 == 0:
                print("Processed nodes: {}\nLength of stack: {}".format(count, len(stack)))
            node = Person(stack.pop())
            if not node.is_persisted():
                # Wait out the remainder of the interval since the last fetch.
                time_elapsed = time.time() - t1
                if time_elapsed < get_frequency:
                    time.sleep(get_frequency - time_elapsed)
                try:
                    node.fetch()
                except (req.exceptions.ConnectionError, req.exceptions.ConnectTimeout) as e:
                    node.log(str(e))
                    break
                node.persist()
                t1 = time.time()
                _persist_relations(node, stack)
            # NOTE(review): counted once per visited stack entry, whether or
            # not it had to be fetched -- confirm this matches the intent.
            count += 1
    except Exception as e:
        # ``node`` is still None when constructing the start node failed.
        if node is not None:
            node.log(str(e))
        # Bare raise keeps the original traceback.
        raise
    return (stack, count)