#!/usr/bin/python2.7
# -*- coding: utf-8 -*-
import string, os, sys
from splithtml import *
import codecs
try:
    import re2 as re
except ImportError:
    import re
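# re2 (the Python bindings for Google's RE2) is preferred when available because RE2
# matches in linear time and cannot hang on a pathological pattern; the standard
# library's re module is the fallback.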
regfile = os.path.abspath(os.path.dirname(__file__))+'/usc-regex.txt'
divider="<!-- documentid:"
_DEBUG = False
# To parse the html in data/, documentid:26_12 = 26usc12
def makergxlist(regfile):
    # Read the regex file and build a list of (compiled pattern, replacement) pairs.
    rfile = codecs.open(regfile, 'r', 'utf-8')
    findreplace = []
    for eachline in rfile.readlines():
        if (not eachline.isspace()) and (eachline[0] != '#'):
            pattern_replace = unicode(eachline).strip().split('#')
            findreplace.append(pattern_replace)
    rfile.close()
    findreplace = [(re.compile(pattern, re.U | re.M), replacement) for pattern, replacement in findreplace]
    return findreplace
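# Assumed format of usc-regex.txt (inferred from the split('#') above, not documented
# in this file): one "pattern#replacement" pair per line, separated by a single '#',
# with blank lines and lines beginning with '#' skipped as comments. A hypothetical
# entry might look like:
#   Section (\d+)#<a href="/laws/\1">Section \1</a>
# (illustrative only; the real patterns ship with the repository in usc-regex.txt).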
def subfile(inputfile, findreplace):
    # Run every (pattern, replacement) pair over the input text, in order.
    outtext = inputfile
    if inputfile is None:
        return outtext
    subs = 0
    for counter, pattern_replace in enumerate(findreplace):
        try:
            outtext, subs = re.subn(pattern_replace[0], pattern_replace[1], outtext)
            if _DEBUG and subs > 0:
                print pattern_replace[0].pattern
                print pattern_replace[1]
                print "=" * 30
        except KeyboardInterrupt:
            # Report which pattern was running when the user interrupted, then continue.
            print counter, ' substituted: ', subs
            print pattern_replace[0], "||||", pattern_replace[1], outtext
    return outtext
# Dynamically replace references with links
def parsesections(pattern, pattern_replace, section):
    sectionsref = re.search(pattern, section)
    while sectionsref:
        i1 = sectionsref.start(1)
        i2 = sectionsref.end(2)
        #print "found multiple secs at", i1, "-", i2
        section = section[:i1] + re.sub(pattern_replace[0], pattern_replace[1] % sectionsref.group(2), section[i1:i2]) + section[1 + i2:]
        sectionsref = re.search(pattern, section)
    return section
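# Illustrative sketch of what parsesections() does, assuming a marker block of the
# shape matched by the pattern built in parsers() (the document id "26_1" here is a
# made-up example): given
#   @@@ Sections 101, 102 @@@@@26_1@@
# each section number inside the block is rewritten as a link such as
#   <a href="/laws/target/26_1/101" class="sec">101</a>
# where "26_1" is the document id captured by the outer pattern's second group.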
def parsenamedacts(pattern, intext):
    namedacts = re.findall(pattern, intext)
    namedacts = list(set(namedacts))
    outtext = intext
    for namedact in namedacts:
        #outtext = outtext.replace(namedact+r'@/', encode_act(namedact)+r'@/')
        outtext = outtext.replace(r'ref-namedact-' + namedact, r'ref-namedact-' + encode_act(namedact))
    return outtext
def encode_act(namedact):
    #encoding = namedact.translate(None, string.punctuation) # removed because it fails with unicode objects
    encoding = ''.join([char if char not in string.punctuation else '' for char in namedact])
    encodelen = str(len(encoding))
    encoding = [char if char not in string.lowercase + " " else '' for char in encoding]  #[encoding.translate(None, string.lowercase + r' ')]
    encoding.append('-' + encodelen)
    out = "".join(encoding)
    return out
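# Worked example (an illustration, not from the original source): for
# encode_act(u"Clean Air Act"), stripping punctuation leaves "Clean Air Act"
# (13 characters); dropping lowercase letters and spaces then leaves "CAA",
# so the encoded form is "CAA-13".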
# Loop through sections with various parsing passes
def parsers(uscfiles, findreplace):
    parsedfiles = []
    for counter, section in enumerate(uscfiles):
        #print ""
        #print "File", counter
        parsedfile = subfile(section, findreplace)
        #print parsedfile
        # Replace multiple Section references with links
        # Include [^<] to make sure no group is transformed twice
        pattern = r'@@@\s[Ss]ections?\s([^<]*?)@@@@@(.*?)@@'
        pattern_replace = [r'%s' % u'(\d+\w*(?:\(\w+\))*[-|–]?\d*)([, @])', r'<a href="/laws/target/%s/\1" class="sec">\1</a>\2']
        parsedfile = parsesections(pattern, pattern_replace, parsedfile)
        #parsedfile = re.sub(r'@of-ref@', r'ref-Title-'+title, parsedfile)
        parsedfile = re.sub(r'@@ref-.*?@', r'', parsedfile)
        #parsedfile = re.sub(r'ref-title-this', r'ref-title-'+title, parsedfile)
        # Encode Named Acts by removing lowercase and non-word characters,
        # and appending the length of the name without non-word characters
        #pattern = r'@@ref-namedact-(.*?)@@'
        pattern = r'/ref-namedact-(.*?)/'
        parsedfile = parsenamedacts(pattern, parsedfile)
        parsedfile = re.sub(r'@of-ref@', r'ref-title-this', parsedfile)
        parsedfile = re.sub(r'@@ref-.*?@', r'', parsedfile)
        # Remove remaining @
        parsedfile = parsedfile.replace('@', '')  #.translate(None, '@')
        parsedfiles.append(parsedfile)
    return parsedfiles
def parse(data=False):
    uscfiles = [data, ]  #.split(divider)#splithtml()
    findreplace = makergxlist(regfile)
    parsedfiles = parsers(uscfiles, findreplace)
    return parsedfiles
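# Minimal usage sketch (not part of the original module): read one pre-marked HTML
# file and print the parsed output. The input path below is a placeholder; the
# original code imports splithtml and refers to data/, so the real input presumably
# comes from there.
if __name__ == '__main__':
    sample_path = 'data/26_12.htm'  # hypothetical input file
    with codecs.open(sample_path, 'r', 'utf-8') as infile:
        marked_html = infile.read()
    for parsed in parse(marked_html):
        print parsed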