build_pacs.py
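# Overview (inferred from the code below):
"""Scrape the PACS 2010 regular edition pages from aip.org, build a
hierarchy of PACS codes with parent/child links and cross-references,
and merge the result into pacs.yml."""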
import urllib2
from BeautifulSoup import BeautifulSoup as BS
import re
import yaml
# A full PACS code: "NN.NN." followed by two suffix characters
# (a letter, '-' or '+' for the first; a letter or '-' for the second).
PACS_REGEX = re.compile(r"[0-9]{2}\.[0-9]{2}\.[a-zA-Z\-\+][a-zA-Z\-]")
# As above, but the second and third fields are optional, so a bare
# two-digit group ("NN.") also matches.
PACS_REGEX_WEAK = re.compile(r"[0-9]{2}\.([0-9]{2}\.[a-zA-Z\-\+][a-zA-Z\-])?")
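# Illustrative matches (hypothetical input strings, shown for orientation):
#   PACS_REGEX.search("see 04.20.Cv for details").group(0)   -> "04.20.Cv"
#   PACS_REGEX_WEAK.search("04. Section heading").group(0)   -> "04."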
def open_http(url):
    """Tries to open an http url. Raises an error if the request fails."""
    # Can form POST requests and add headers here
    req = urllib2.Request(url)
    # Make request (request, data - do not use, timeout in seconds)
    result = urllib2.urlopen(req, None, 10)
    return result
def pacs_get_level(code):
    """Infer the depth (1-5) of a PACS code from its textual form."""
    if code[1] == '0': return 1          # "N0." -- a top-level section
    if len(code) == 3: return 2          # "NN." -- a two-digit group
    assert len(code) == 8
    if code[6] in ('+', '-'): return 3   # suffix starts with '+' or '-'
    if code[6].isupper(): return 4       # suffix starts with an upper-case letter
    return 5                             # suffix starts with a lower-case letter
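# Examples of the classification above (hypothetical codes; the levels
# follow directly from the rules in pacs_get_level):
#   pacs_get_level("10.")       -> 1
#   pacs_get_level("14.")       -> 2
#   pacs_get_level("14.20.-c")  -> 3
#   pacs_get_level("14.20.Dh")  -> 4
#   pacs_get_level("14.20.dh")  -> 5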
def get_node(key):
    # The key should be the PACS code
    return nodes.setdefault(key, {'code': key, 'level': pacs_get_level(key),
                                  'cross_references': [], 'cross_referenced': [],
                                  'cross_parents': [], 'children': [],
                                  'cross_children': [], 'year_s': '2010'})
# Reuse any previously scraped data so repeated runs merge into pacs.yml.
try:
    f = open('pacs.yml')
except IOError:
    nodes = {}
else:
    nodes = yaml.load(f.read())
    f.close()

urls = [("http://www.aip.org/pacs/pacs2010/individuals/pacs2010_regular_edition/reg%d0.htm" % i, 'main')
        for i in range(0, 10)]
urls = urls[0:]  # no-op slice
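# The list above expands to one page per top-level section:
#   .../pacs2010_regular_edition/reg00.htm, reg10.htm, ..., reg90.htm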
for url in urls:
    res = ''.join(open_http(url[0]).readlines())
    soup = BS(res)
    rows = soup.findAll('tr')
    cur_node = None
    for r in rows:
        try:
            html_cells = r.findAll('td')
            if len(html_cells) != 2: continue
            cells = map(lambda x: str(''.join(x.findAll(text=True))), html_cells)
            if "... ... ..." in cells[0]:
                # A "... ... ..." row cross-lists another code under the current node.
                pacs_match = re.search(PACS_REGEX, cells[1])
                assert pacs_match
                get_node(pacs_match.group(0))['cross_parents'].append(cur_node['code'])
                cur_node['cross_children'].append(pacs_match.group(0))
                continue
            else:
                out = {}
                code = re.search(PACS_REGEX_WEAK, cells[0]).group(0)
                out = get_node(code)
                out['source'] = url[1]
                name = cells[1].rstrip()
                description = ''
                # If the last italicised run starts with '(' or '[', treat it as a
                # description rather than part of the name.
                try:
                    comment_candidate_split = html_cells[1].findAll('i')[-1].findAll(text=True)
                    if comment_candidate_split[0][0] in ('(', '['):
                        n = len(comment_candidate_split)
                        description = str(''.join(comment_candidate_split))
                        name = str(''.join(html_cells[1].findAll(text=True)[0:-1 * n])).rstrip()
                except IndexError:
                    pass
                assert name
                out['name'] = name
                out['description'] = description
                # Record cross-references mentioned in the entry text.
                for x in re.findall(PACS_REGEX, cells[1]):
                    y = get_node(x)
                    y['cross_referenced'].append(code)
                    out['cross_references'].append(x)
                # Walk back up the open branch until this entry's parent is found.
                out['parent'] = None
                while cur_node:
                    if out['level'] > cur_node['level']:
                        assert out['level'] == cur_node['level'] + 1
                        out['parent'] = cur_node['code']
                        cur_node['children'].append(code)
                        break
                    else:
                        cur_node = nodes[cur_node['parent']]
                cur_node = out
        except:
            print "ERROR: %s" % r
# Write the merged hierarchy back out.
f = open('pacs.yml', 'w')
f.write(yaml.dump(nodes))
f.close()
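# A minimal sketch of reading the output back (assumes the run above wrote
# pacs.yml; '04.20.Cv' is a hypothetical key -- any scraped code works):
#
#   import yaml
#   nodes = yaml.load(open('pacs.yml').read())
#   node = nodes.get('04.20.Cv')
#   if node:
#       print node['name'], node['parent'], node['children']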