# html_cleaner.rb (forked from jheitzeb/ruby-gpt3-commander)
# This class contains utilities to simplify HTML for use in size-limited GPT-3 prompts
# without sacrificing the meaning of the HTML.
# Why? Puppeteer surfs and pulls the HTML from pages it visits but the HTML can be quite verbose.
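#
# Typical usage (a sketch; assumes Nokogiri and ActiveSupport are loaded and that
# `raw_html`, `title`, and `url` come from the Puppeteer page capture):
#
#   cleaned = HtmlCleaner.clean_html(raw_html, page_title: title, page_url: url)
#   parts   = HtmlCleaner.split_html(cleaned, max: 3900)
#   # each part is small enough to interpolate into a GPT-3 prompt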
class HtmlCleaner
BASIC_ELEMENTS = [ "p", "br", "span", "div", "td" ]
LINKABLES = [
"a",
"link",
"button",
"btn"
]
SIMPLER_ELEMENT_NAMES = {
"a" => "link",
"anchor" => "link",
}
CLASS_RENAMER = {
"title" => "section",
"btn" => "button",
}
# Some common classes that carry meaning (which GPT-3 processing will care about) and should therefore be preserved.
# Matching is by substring; see the example after the list.
CLASS_WHITELIST = [
"button",
"btn",
"link",
"input",
"strikethrough",
"title",
"rank",
"priority",
"star",
"rating",
"review",
"score",
"price",
"cost",
"menu",
"user",
"date",
"time",
"page",
"age",
"month",
"day",
"year",
"type",
"category",
"kind",
"offer",
"promo",
"sale",
"cart",
"add",
"image",
"email",
"street",
"city",
"cities",
"zip",
"postal",
"country",
"reservation",
"availability",
"quantity",
"inventory",
"product",
"sku",
"notify",
"share",
"important",
"comment",
"article",
"venue",
"location",
"color",
"footer",
"skip",
"next",
"previous",
"cuisine",
"neighborhood",
]
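# The kept class is the whitelist word itself (after CLASS_RENAMER), not the original class.
# For example, an element with class="product-price old" keeps the classes "price" and
# "product", and class="titlelink" keeps "link" and "section" ("title" => "section").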
def self.simpler_element_name(element_name)
if SIMPLER_ELEMENT_NAMES.key?(element_name.downcase)
return SIMPLER_ELEMENT_NAMES[element_name.downcase].downcase
else
return element_name.downcase
end
end
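# e.g. simpler_element_name("A") #=> "link", simpler_element_name("SPAN") #=> "span"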
def self.simpler_class_name(class_name)
if CLASS_RENAMER.key?(class_name.downcase)
return CLASS_RENAMER[class_name.downcase].downcase
else
return class_name.downcase
end
end
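# e.g. simpler_class_name("BTN") #=> "button", simpler_class_name("price") #=> "price"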
def self.clean_classes(node)
pclass = node['class']
keeper_classes = []
if pclass.present?
classes = pclass.to_s.split(" ")
# are any of the classes (downcased) exact members of LINKABLES?
found = classes.select {|class_name| LINKABLES.include?(class_name.downcase)}
if found.blank?
# if none of these are LINKABLES, walk up the tree looking for one, and add it if found.
go_up = node.parent
# Walk upward one ancestor at a time, stopping at the document node.
while go_up.present? && go_up.element?
go_up_name = go_up.name.downcase
if LINKABLES.include?(go_up_name)
if go_up_name == "a"
go_up_name = "link"
end
keeper_classes = [go_up_name] + keeper_classes
break
end
go_up_classes = go_up['class'].to_s.split(" ")
found = go_up_classes.select {|class_name| LINKABLES.include?(class_name.downcase)}
if found.present?
found.each do |found_class|
if found_class == "a"
found_class = "link"
end
keeper_classes = [found_class] + keeper_classes
end
break
end
go_up = go_up.parent
end
end
# only keep classes if they contain (as a substring) any of the words in the CLASS_WHITELIST,
# and only keep the whitelist version of the class (which has simpler / more compact semantics).
classes.each do |class_name|
CLASS_WHITELIST.each do |wl_class|
if class_name.downcase.include?(wl_class.downcase)
keeper_classes << simpler_class_name(wl_class.downcase)
end
end
end
keeper_classes.uniq!
end
return keeper_classes
end
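# For example, clean_classes picks up whitelist words from a node's classes
# (a sketch; assumes Nokogiri and ActiveSupport are loaded):
#
#   node = Nokogiri::HTML("<div class='product-price old'>$25</div>").at_css("div")
#   HtmlCleaner.clean_classes(node) #=> ["price", "product"]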
def self.clean_html(raw_html, page_title: nil, page_url: nil)
html = raw_html.encode('UTF-8', invalid: :replace, undef: :replace, replace: '', universal_newline: false).gsub(/\P{ASCII}/, '')
parser = Nokogiri::HTML(html, nil, Encoding::UTF_8.to_s)
parser.xpath('//script')&.remove
parser.xpath('//style')&.remove
# Build the new doc as we go.
nodes_processed = []
# parse the HTML into nodes
# and build into a tree of depth=2 where parents have children
# and the parents are in order.
# First, get all the leaf nodes in order.
leaf_nodes = []
parser.xpath('//*[not(*)]').each do |node|
leaf_nodes << node
end
# Next, go through getting parents (in order) and build a data structure
# that will store the 1:n relation of parent to child/leaf.
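# e.g. parent_hashes ends up looking roughly like:
#   [{ parent: <div.story>, children: [<a.titlelink>, <span.score>] }, ...]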
parent_hashes = []
leaf_nodes.each do |node|
parent = node.parent
if parent.present?
# Find that parent in parent_hashes
parent_index = parent_hashes.index { |h| h[:parent] == parent }
if parent_index.present?
# add this child to the parent's children array
parent_hashes[parent_index][:children] << node
else
# create a new parent hash
parent_hashes << { parent: parent, children: [node] }
end
end
end
# Finally, go through and build the HTML:
build_html = []
parent_hashes.each do |parent_hash|
parent = parent_hash[:parent]
children = parent_hash[:children]
formatted = format_parent_and_children(parent, children)
if formatted.present?
build_html << formatted
end
end
# Add meta tags for the page title and URL at the top of the output.
if page_title.present? && page_url.present?
build_html.unshift("<meta name='og:title' content='#{page_title}' />")
build_html.unshift("<meta name='og:url' content='#{page_url}' />")
end
# Print a few lines of the HTML for debugging purposes:
debug = false
if debug
puts " - - - - - - - - - ".white.bold
build_html.first(50).each do |line|
puts " " + line.white
end
puts " - - - - - - - - - ".white.bold
puts ""
end
# Collect a list of all classes that appear in the original HTML
# (not currently used in the return value).
original_classes = []
parser.xpath('//*').each do |node|
if node.attributes["class"].present?
class_string = node.attributes["class"].value
class_string.split(" ").each do |c|
original_classes << c
end
end
end
build_html.join("\n")
end
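# Example for clean_html (illustrative; whitespace in the real output differs slightly):
#
#   raw = "<div class='story'>" \
#         "<a href='/item?id=1' class='titlelink'>Show HN: Foo</a>" \
#         "<span class='score'>228 points</span></div>"
#   HtmlCleaner.clean_html(raw)
#   # => roughly:
#   #   <link class='section' href='/item?id=1'>Show HN: Foo</link>
#   #   <score>228 points</score>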
def self.format_parent_and_children(parent, children)
node_html = ""
if children.count > 1
keeper_classes = clean_classes(parent)
# Remove any classes that are equal to the element name.
# <link class='link'>Top Rated</link> --> <link>Top Rated</link>
keeper_classes = keeper_classes.reject { |c| c.downcase == parent.name.downcase }
parent_name = simpler_element_name(parent.name)
needs_parent = true
if keeper_classes.blank?
if parent_name == "p" || parent_name == "br" || parent_name == "div" || parent_name == "span"
needs_parent = false
else
node_html << "<#{parent_name}>"
end
else
node_html << "<#{parent_name} class='#{keeper_classes.join(' ')}'>"
end
children_html = ""
children.each do |child|
child_html = format_child_node(child)
if child_html.present?
child_html = "\n " + child_html
children_html << child_html
end
end
if children_html.blank?
return ""
else
node_html << " #{children_html}"
end
if needs_parent
node_html << "\n</#{parent_name}>"
end
else
child_html = format_child_node(children.first)
if child_html.present?
node_html << child_html
end
end
node_html
end
# Take a single parsed HTML node and reformat it into something simpler.
def self.format_child_node(node)
element = simpler_element_name(node.name).downcase
# get the immediate text content of the node (leaf nodes have no element children)
text = node.content.strip
# if the node has significant classes, keep them, as they may hint at the meaning.
keeper_classes = clean_classes(node)
# Is this needed?
if keeper_classes.present? && element.blank?
element = "p"
end
# If the element is a basic one (p, br, span, div, td), "elevate" the first class name to be the element name.
# Examples:
# <p class='score'>228 points</p> --> <score>228 points</score>
# <div class='button time'>6:00PM</div> --> <button class='time'>6:00PM</button>
if keeper_classes.present?
if BASIC_ELEMENTS.include?(element)
element = keeper_classes[0]
keeper_classes.shift # removes the first element
end
end
# Remove any classes that are equal to the element name.
# <link class='link'>Top Rated</link> --> <link>Top Rated</link>
keeper_classes = keeper_classes.reject { |c| c.downcase == element.downcase }
text = text.strip
# if the text contains only whitespace characters (including non-breaking spaces, U+00A0), treat it as empty.
if text.gsub(/[\s\u00A0]/, '').empty?
text = ""
end
formatted = ""
if text.blank?
return ""
end
# If the element is H1, H2, H3, H4, H5, H6, then we don't need the class "section"
hs = ["h1", "h2", "h3", "h4", "h5", "h6"]
if hs.include?(element)
keeper_classes.delete("section")
end
# If the element has no classes and is span | p | br | div | td, then just return the text.
if keeper_classes.blank? && BASIC_ELEMENTS.include?(element)
return text
end
# If the element has no classes and the parent has multiple children, then just return the text.
#if keeper_classes.blank? && node.parent.children.count > 1
# return text
#end
if element.present?
keeper_classes_str = keeper_classes.join(' ')
href = ""
# if the node has href, add it back in to formatted.
if node.attributes["href"].present?
href = node.attributes["href"].value
href = " href='#{href}'"
end
if keeper_classes_str.present?
formatted = "<#{element} class='#{keeper_classes_str}'#{href}>#{text}</#{element}>"
else
formatted = "<#{element}#{href}>#{text}</#{element}>"
end
else
formatted = text
end
return formatted
end
# If the HTML is too large to fit into the prompt parameter,
# split it so the prompt can be run multiple times and the
# results post-processed.
def self.split_for_open_ai(clean_html, prompt, overhead)
open_ai_max_tokens = 2048
open_ai_max_chars = (open_ai_max_tokens * 4).to_i
safe_buffer = 100
max_chars = open_ai_max_chars - overhead.length - prompt.length - safe_buffer
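# e.g. with a 600-character prompt and 200 characters of overhead,
# max_chars = 8192 - 200 - 600 - 100 = 7292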
split_html(clean_html, max: max_chars)
end
# Split the HTML into an array of strings, each shorter than max characters,
# without splitting inside a tag.
# OpenAI: for most models the limit is 2048 tokens, or about 1500 words.
# One token is ~4 characters of common English text.
def self.split_html(clean_html, max: 3900)
if clean_html.length <= max
return [clean_html]
end
# Use a regex to split on closing tags while keeping the closing tags in the results:
# a zero-width lookahead (?=...) splits before each closing tag without consuming it,
# whereas clean_html.split(/<\/[^>]+>/) would drop the tags.
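# e.g. "<p>a</p><p>b</p>".split(/(?=<\/[^>]+>)/) #=> ["<p>a", "</p><p>b", "</p>"]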
groups = clean_html.split(/(?=<\/[^>]+>)/)
ar_parts = []
cur_part_len = 0
next_chunk = []
groups.each do |bits|
cur_part_len = cur_part_len + bits.length
if cur_part_len < max
next_chunk << bits
else
ar_parts << next_chunk
next_chunk = []
next_chunk << bits
cur_part_len = bits.length
end
end
# Flush the final chunk (it is never pushed inside the loop).
if next_chunk.any?
ar_parts << next_chunk
end
# Join all the parts
parts = []
ar_parts.each do |sub_array|
parts << sub_array.join(" ")
end
# Pull out the meta name='og:url' and meta name='og:title' tags.
parsed = Nokogiri::HTML(clean_html)
og_url = parsed.css("meta[name='og:url']").first.try(:attr, "content")
og_title = parsed.css("meta[name='og:title']").first.try(:attr, "content")
# Add a "page i of n" meta tag to each part,
# and the og:url and og:title tags to pages 2+.
total_parts = parts.length
parts = parts.map.with_index do |part, i|
if i >= 1
part = "<meta name=\"og:url\" content=\"#{og_url}\">\n" + part
part = "<meta name=\"og:title\" content=\"#{og_title}\">\n" + part
end
part = "<meta name=\"page\" content=\"#{i+1} of #{total_parts}\">\n" + part
end
parts
end
end