forked from deternitydx/SNAC-EAC-Parser
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_triples.py
177 lines (158 loc) · 7.03 KB
/
generate_triples.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import codecs
import os
import fileinput
import sys
# Import XML parser
import xml.etree.ElementTree as ET
# Set up the triple output.
# The codecs writer encodes text to UTF-8 *bytes* before handing it to the
# underlying stream.  On Python 3 sys.stdout is a text stream that rejects
# bytes, so wrap its binary buffer when one exists; on Python 2 sys.stdout has
# no ``buffer`` attribute and already accepts bytes, so it is wrapped directly.
output = codecs.getwriter('utf8')(getattr(sys.stdout, 'buffer', sys.stdout))

# Turtle prefix declarations emitted once, ahead of any triples.
_TURTLE_PREFIXES = (
    ("snac", "http://socialarchive.iath.virginia.edu/control/term#"),
    ("snacead", "http://socialarchive.iath.virginia.edu/control/term#ead/"),
    ("foaf", "http://xmlns.com/foaf/0.1/"),
    ("owl", "http://www.w3.org/2002/07/owl#"),
    ("schema", "http://schema.org/"),
    ("edm", "http://www.europeana.eu/schemas/edm/"),
    ("skos", "http://www.w3.org/2004/02/skos/core#"),
    ("rdaGr2", "http://RDVocab.info/ElementsGr2/"),
    ("dc", "http://purl.org/dc/elements/1.1/"),
    ("dcterms", "http://purl.org/dc/terms/"),
)
output.write("".join("@prefix %s: <%s> .\n" % pair for pair in _TURTLE_PREFIXES))
# Prefix -> namespace URI map handed to ElementTree's find()/findall() so the
# XPath expressions below can use short prefixes.
namespaces = dict(
    snac="urn:isbn:1-931666-33-4",
    snac2="http://socialarchive.iath.virginia.edu/control/term#",
    schema="http://schema.org/",
    xlink="http://www.w3.org/1999/xlink",
    snac3="http://socialarchive.iath.virginia.edu/",
)
# Register the same prefixes with ElementTree for serialisation.  The
# "schema" prefix is intentionally left out: it was never registered.
for _registered_prefix in ("snac", "snac2", "snac3", "xlink"):
    ET.register_namespace(_registered_prefix, namespaces[_registered_prefix])
# Fully qualified names of the xlink attributes carried by relation elements.
_XLINK_ARCROLE = "{http://www.w3.org/1999/xlink}arcrole"
_XLINK_HREF = "{http://www.w3.org/1999/xlink}href"


def _escape_literal(text):
    """Escape text for use inside a Turtle '''long single-quoted''' literal.

    Backslashes and single quotes are backslash-escaped so that values which
    start, end with, or contain quotes can no longer terminate the literal
    early or produce an invalid run of four or more quotes.
    """
    return text.replace("\\", "\\\\").replace("'", "\\'")


def _literal_triple(subject, predicate, text):
    """Return one Turtle statement linking <subject> to a string literal."""
    return "".join(["<", subject, "> ", predicate, " '''",
                    _escape_literal(text), "''' .\n"])


def _iri_triple(subject, predicate, target):
    """Return one Turtle statement linking <subject> to <target>."""
    return "".join(["<", subject, "> ", predicate, " <", target, "> .\n"])


def _first_child(node):
    """Return the first child element of node, or None if it has no children."""
    return node[0] if len(node) else None


def _first_child_text(node):
    """Return the text of node's first child, or None if there is none."""
    child = _first_child(node)
    return child.text if child is not None else None


def _relation_links(root, tag):
    """Return (arcrole, href) pairs for every snac:<tag> element under root.

    A missing arcrole is reported as an empty string so that callers can run
    substring tests on it without first checking for None.
    """
    return [(node.get(_XLINK_ARCROLE) or "", node.get(_XLINK_HREF))
            for node in root.findall(".//snac:" + tag, namespaces)]


def _record_triples(root, source):
    """Build the list of Turtle statements describing one parsed record.

    root   -- root element of the parsed XML document
    source -- name of the file the document came from (used in errors only)

    Raises ValueError when the document carries no usable snac:recordId,
    because every statement uses that value as its subject or object.
    """
    # Handle Unique Identifier
    id_node = root.find(".//snac:recordId", namespaces)
    if id_node is None or id_node.text is None:
        raise ValueError("no snac:recordId found in " + source)
    identifier = id_node.text

    # Handle birth/death/active dates
    start = None
    end = None
    exist_dates = root.find(".//snac:existDates", namespaces)
    if exist_dates is not None:
        from_date = exist_dates.find(".//snac:fromDate", namespaces)
        if from_date is not None:
            start = from_date.get("standardDate")
        to_date = exist_dates.find(".//snac:toDate", namespaces)
        if to_date is not None:
            end = to_date.get("standardDate")

    # Handle entity type.  Every entity type is emitted as edm:Agent; the raw
    # type is kept only to decide whether a foaf:name statement is written.
    type_node = root.find(".//snac:entityType", namespaces)
    hidden_type = type_node.text if type_node is not None else None
    entity_type = "edm:Agent"

    # Handle names: the first name entry is the preferred label, the rest
    # are alternative labels.
    labels = [_first_child_text(node)
              for node in root.findall(".//snac:nameEntry", namespaces)]
    name = labels[0] if labels else ""
    alt_names = labels[1:]

    # Handle local descriptions (subjects, nationalities)
    subjects = []
    nationalities = []
    for node in root.findall(".//snac:localDescription", namespaces):
        text = _first_child_text(node)
        for value in node.attrib.values():
            if "AssociatedSubject" in value:
                subjects.append(text)
            if "nationalityOfEntity" in value:
                nationalities.append(text)

    # Handle places (only the "likely same" place entries are used)
    places = [node.get("vocabularySource")
              for node in root.findall(".//snac3:placeEntryLikelySame", namespaces)]

    # Handle occupations
    occupations = [_first_child_text(node)
                   for node in root.findall(".//snac:occupation", namespaces)]

    # Handle languages
    languages = []
    for node in root.findall(".//snac:languageUsed", namespaces):
        child = _first_child(node)
        languages.append(child.get("languageCode") if child is not None else None)

    # Handle cpf relationships
    associated = []
    corresponded = []
    sameas = []
    for role, link in _relation_links(root, "cpfRelation"):
        if "associatedWith" in role:
            associated.append(link)
        elif "correspondedWith" in role:
            corresponded.append(link)
        elif "sameAs" in role:
            sameas.append(link)

    # Handle resource relationships
    referencedin = []
    creatorof = []
    for role, link in _relation_links(root, "resourceRelation"):
        if "referencedIn" in role:
            referencedin.append(link)
        elif "creatorOf" in role:
            creatorof.append(link)

    # Assemble the statements for this record.  Literal values are escaped as
    # they are generated, so quotes inside names no longer break the output.
    # NOTE(review): entity_type is written inside angle brackets, producing
    # <edm:Agent> rather than the prefixed name edm:Agent -- confirm whether
    # downstream consumers rely on this before changing it.
    triples = [_iri_triple(identifier, "a", entity_type)]

    def add_literals(predicate, values):
        # Values of None (elements without text/attribute) are skipped.
        triples.extend(_literal_triple(identifier, predicate, value)
                       for value in values if value is not None)

    def add_links(predicate, targets):
        triples.extend(_iri_triple(identifier, predicate, target)
                       for target in targets if target is not None)

    add_literals("skos:prefLabel", [name])
    add_literals("edm:start", [start])
    add_literals("edm:end", [end])
    if hidden_type == 'person':
        add_literals("foaf:name", [name])
    add_literals("skos:altLabel", alt_names)
    add_literals("edm:isRelatedTo", subjects)
    add_literals("schema:nationality", nationalities)
    add_literals("rdaGr2:languageOfThePerson", languages)
    add_links("edm:hasMet", places)
    add_literals("rdaGr2:professionOrOccupation", occupations)
    add_links("edm:hasMet", associated)
    add_links("edm:hasMet", corresponded)
    add_links("owl:sameAs", sameas)
    add_links("dcterms:isReferencedBy", referencedin)
    # For created resources the record is the object, not the subject.
    triples.extend(_iri_triple(doc, "dc:creator", identifier)
                   for doc in creatorof if doc is not None)
    return triples


# Each input line (from the files named on the command line, or from standard
# input) names one XML file: parse it and write out its triples.
for filename in fileinput.input():
    path = filename.strip()
    if not path:
        # Ignore blank lines in the file list instead of trying to open "".
        continue
    root = ET.parse(path).getroot()
    for statement in _record_triples(root, path):
        output.write(statement)