-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathverifyHeadings.py
95 lines (85 loc) · 3.23 KB
/
verifyHeadings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup as Soup
from fuzzywuzzy import fuzz
from rdflib import Namespace, Graph, URIRef
# Shared HTTP settings: the header set sent with id.loc.gov requests.
headers = {'User-Agent': 'Custom user agent'}

# Separate sessions per remote service: `lc` is used for id.loc.gov
# lookups, `ft` for FAST searches.
lc, ft = requests.Session(), requests.Session()

# URL pieces. A FAST query is built as fastURL + <term> + fastPara.
baseURL = 'http://id.loc.gov/authorities/'
fastURL = 'http://id.worldcat.org/fast/search?query=cql.any+all+%22'
fastPara = '%22&fl=oclc.heading&recordSchema=info:srw/schema/1/rdf-v2.0'

# RDF vocabulary namespace and the URI prefix used to recognise
# authority subjects inside a parsed graph.
mads = Namespace('http://www.loc.gov/mads/rdf/v1#')
auth = URIRef('http://id.loc.gov/authorities/')

# Vocabulary code found on an input item -> path segment appended to
# the authorities base URL.
authorities = dict(lcnaf='names', lcsh='subjects', genre='genreForms')
def getGraph(url, format):
    """Download an RDF document and parse it into an rdflib graph.

    Args:
        url: Address of the serialized RDF to fetch.
        format: Name of the rdflib parser to use for the payload
            (callers in this file pass ``'nt'``).

    Returns:
        The parsed ``Graph``, or ``None`` when the request fails or the
        server answers with an HTTP error status.
    """
    try:
        response = lc.get(url, timeout=30, headers=headers)
    except requests.exceptions.RequestException as error:
        # RequestException is the base class of Timeout, so the timeout
        # case is still covered; connection errors and the like no longer
        # abort the whole run.
        print('Request failed for', url, '-', error)
        return None
    if not response.ok:
        # An HTTP error page is not RDF; handing it to the parser would
        # only raise a parse error further down.
        print('Request for', url, 'returned status', response.status_code)
        return None
    graph = Graph()
    graph.parse(data=response.text, format=format)
    return graph
def findTermFromLabel(searchTerm, type):
    """Resolve a heading label to its authority URI via the label service.

    Args:
        searchTerm: Heading text to look up.
        type: Authority path segment placed after the base URL
            (one of the values of ``authorities``).

    Returns:
        The final URL of the response with any ``.html`` removed, or
        ``None`` when either argument is empty, the request fails, or the
        server does not answer with a success status.
    """
    if not searchTerm or not type:
        # Nothing to look up; also avoids a TypeError when the caller has
        # no authority type for the item's vocabulary.
        return None
    # Escape the label so characters such as '?' or '#' stay part of the
    # path instead of starting a query string or fragment.
    url = baseURL + type + '/label/' + quote(searchTerm)
    try:
        response = lc.get(url, timeout=30, headers=headers)
    except requests.exceptions.RequestException:
        return None
    if not response.ok:
        # Previously the response URL was returned even for failed
        # lookups, which sent the caller off to fetch a non-existent record.
        return None
    return response.url.replace('.html', '')
def getInfoFromGraph(graph, item, searchTerm, type):
    """Record a matching authoritative label and its URI on ``item``.

    Walks every (subject, object) pair of the ``mads.authoritativeLabel``
    predicate in ``graph``. For subjects whose URI contains the authority
    prefix for ``type``, the label is compared with ``searchTerm`` using
    ``fuzz.ratio``; a score above 95 writes ``authURI`` and ``authLabel``
    into ``item``. ``item`` is modified in place and nothing is returned.
    A falsy ``graph`` is ignored.
    """
    if not graph:
        return
    for subject, label in graph.subject_objects(mads.authoritativeLabel):
        # Skip subjects outside the requested authority file.
        if auth + type not in subject:
            continue
        if fuzz.ratio(label.value, searchTerm) <= 95:
            continue
        print('Heading validated')
        item['authURI'] = subject.toPython()
        item['authLabel'] = label.value
def _verifyLcHeading(item, searchTerm, authType):
    """Validate ``searchTerm`` against id.loc.gov by known URI or by label."""
    if not authType:
        # The item's vocabulary has no entry in ``authorities``; there is
        # no authority file to query.
        print('No authority type for vocabulary', item.get('vocab'))
        return
    uri = item.get('uri')
    # The original check only recognised the literal string 'None' as
    # "no URI"; a missing key or empty value is treated the same way so
    # it falls back to the label lookup instead of crashing.
    if not uri or uri == 'None':
        uri = findTermFromLabel(searchTerm, authType)
    if uri:
        graph = getGraph(uri + '.nt', 'nt')
        getInfoFromGraph(graph, item, searchTerm, authType)


def _verifyFastHeading(item, searchTerm):
    """Validate ``searchTerm`` against the FAST search service (exact match)."""
    try:
        # Escape the term so '&', '#', '+' and similar characters cannot
        # alter the query string; the timeout keeps a stalled server from
        # hanging the run.
        response = ft.get(fastURL + quote(searchTerm) + fastPara, timeout=30)
    except requests.exceptions.RequestException as error:
        print('FAST request failed for', searchTerm, '-', error)
        return
    record = Soup(response.content, features='lxml').find('record')
    if record is None:
        return
    identifierTag = record.find('dct:identifier')
    labelTag = record.find('skos:preflabel')
    if identifierTag is None or labelTag is None or labelTag.string is None:
        return
    # Convert to plain str so the stored values do not keep the parse
    # tree alive.
    authLabel = str(labelTag.string)
    if authLabel == searchTerm:
        identifier = identifierTag.string
        print('Heading validated')
        item['authLabel'] = authLabel
        item['authURI'] = None if identifier is None else str(identifier)


def verifyHeadingList(searchList):
    """Check each heading in ``searchList`` against its authority service.

    Args:
        searchList: Iterable of dicts. The keys read are ``term`` (heading
            text), ``vocab`` (``'fast'`` or a key of ``authorities``) and
            ``uri`` (a known authority URI, or the string ``'None'``).

    Returns:
        A list of the same dict objects, modified in place: ``authURI``
        and ``authLabel`` are added when a heading is validated, and the
        ``uri`` key is removed.
    """
    all_items = []
    for item in searchList:
        searchTerm = item.get('term')
        if searchTerm:
            vocab = item.get('vocab')
            print(vocab)
            print(searchTerm)
            if vocab == 'fast':
                _verifyFastHeading(item, searchTerm)
            else:
                _verifyLcHeading(item, searchTerm, authorities.get(vocab))
        # NOTE(review): indentation of the original was lost; this assumes
        # the clean-up below ran for every item, including those without a
        # term - confirm against the upstream file.
        item.pop('uri', None)  # tolerate items that never had a 'uri' key
        print(item)
        all_items.append(item)
    return all_items