forked from Charcoal-SE/SmokeDetector
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelpers.py
591 lines (475 loc) · 21.3 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
# coding=utf-8
import os
import sys
import traceback
from datetime import datetime, timezone
import time
import importlib
import threading
# termcolor doesn't work properly in PowerShell or cmd on Windows, so use colorama.
import platform
platform_text = platform.platform().lower()
if 'windows' in platform_text and 'cygwin' not in platform_text:
from colorama import init as colorama_init
colorama_init()
from glob import glob
import sqlite3
from urllib.parse import quote, quote_plus
from threading import Thread
from termcolor import colored
import requests
import regex
from regex.regex import _compile as regex_raw_compile
import websocket
from globalvars import GlobalVars
def exit_mode(*args, code=0):
# This code is executed whenever we exit, even when the exit is caused by an exception.
args = set(args)
if not (args & {'standby', 'no_standby'}):
standby = 'standby' if GlobalVars.standby_mode else 'no_standby'
args.add(standby)
with open("exit.txt", "w", encoding="utf-8") as f:
print("\n".join(args), file=f)
log('debug', 'Exiting with args: {}'.format(', '.join(args) or 'None'))
# Flush any buffered queue timing data
import datahandling # this must not be a top-level import in order to avoid a circular import
try:
datahandling.flush_queue_timings_data()
except Exception:
log_current_exception()
try:
datahandling.store_post_scan_stats()
except Exception:
log_current_exception()
try:
datahandling.store_recently_scanned_posts()
except Exception:
log_current_exception()
# Store other pickles as we exit
try:
if GlobalVars.edit_watcher:
GlobalVars.edit_watcher.save()
except Exception:
log_current_exception()
try:
if GlobalVars.deletion_watcher:
GlobalVars.deletion_watcher.save()
except Exception:
log_current_exception()
# We have to use '_exit' here, because 'sys.exit' only exits the current
# thread (not the current process). Unfortunately, this results in
# 'atexit' handlers not being called. All exit calls in SmokeDetector go
# through this function, so any necessary cleanup can happen here (though
# keep in mind that this function isn't called when terminating due to a
# Ctrl-C or other signal).
os._exit(code)
class ErrorLogs:
DB_FILE = "errorLogs.db"
# SQLite threading limitation !?!?!?
db = sqlite3.connect(DB_FILE)
if db.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='error_logs'").fetchone() is None:
# Table 'error_logs' doesn't exist
try:
db.execute("CREATE TABLE error_logs (time REAL PRIMARY KEY ASC, classname TEXT, message TEXT,"
" traceback TEXT)")
db.commit()
except (sqlite3.OperationalError):
# In CI testing, it's possible for the table to be created in a different thread between when
# we first test for the table's existence and when we try to create the table.
if db.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='error_logs'").fetchone() is None:
# Table 'error_logs' still doesn't exist
raise
db.close()
db_conns = {}
@classmethod
def get_db(cls):
thread_id = threading.get_ident()
if thread_id not in cls.db_conns:
cls.db_conns[thread_id] = sqlite3.connect(cls.DB_FILE)
return cls.db_conns[thread_id]
@classmethod
def add_current_exception(cls):
now = datetime.utcnow()
exctype, value, traceback_or_message = sys.exc_info()
tr = get_traceback_from_traceback_or_message(traceback_or_message)
cls.add(now.timestamp(), exctype.__name__, str(value), tr)
@classmethod
def add(cls, time, classname, message, traceback):
classname = redact_passwords(classname)
message = redact_passwords(message)
traceback = redact_passwords(traceback)
db = cls.get_db()
db.execute("INSERT INTO error_logs VALUES (?, ?, ?, ?)",
(time, classname, message, traceback))
db.commit()
@classmethod
def fetch_last(cls, n):
db = cls.get_db()
cursor = db.execute("SELECT * FROM error_logs ORDER BY time DESC LIMIT ?", (int(n),))
data = cursor.fetchall()
return data
@classmethod
def truncate(cls, n=100):
"""
Truncate the DB and keep only N latest exceptions
"""
db = cls.get_db()
cursor = db.execute(
"DELETE FROM error_logs WHERE time IN "
"(SELECT time FROM error_logs ORDER BY time DESC LIMIT -1 OFFSET ?)", (int(n),))
db.commit()
data = cursor.fetchall()
return data
class Helpers:
min_log_level = 0
def escape_format(s):
return s.replace("{", "{{").replace("}", "}}")
def expand_shorthand_link(s):
s = s.lower()
if s.endswith("so"):
s = s[:-2] + "stackoverflow.com"
elif s.endswith("se"):
s = s[:-2] + "stackexchange.com"
elif s.endswith("su"):
s = s[:-2] + "superuser.com"
elif s.endswith("sf"):
s = s[:-2] + "serverfault.com"
elif s.endswith("au"):
s = s[:-2] + "askubuntu.com"
return s
def redact_text(text, redact_str, replace_with):
if redact_str:
return text.replace(redact_str, replace_with) \
.replace(quote(redact_str), replace_with) \
.replace(quote_plus(redact_str), replace_with)
return text
def redact_passwords(value):
value = str(value)
# Generic redaction of URLs with http, https, and ftp schemes
value = regex.sub(r"((?:https?|ftp):\/\/)[^@:\/]*:[^@:\/]*(?=@)", r"\1[REDACTED URL USERNAME AND PASSWORD]", value)
# In case these are somewhere else.
value = redact_text(value, GlobalVars.github_password, "[GITHUB PASSWORD REDACTED]")
value = redact_text(value, GlobalVars.github_access_token, "[GITHUB ACCESS TOKEN REDACTED]")
value = redact_text(value, GlobalVars.chatexchange_p, "[CHAT PASSWORD REDACTED]")
value = redact_text(value, GlobalVars.metasmoke_key, "[METASMOKE KEY REDACTED]")
value = redact_text(value, GlobalVars.perspective_key, "[PERSPECTIVE KEY REDACTED]")
return value
# noinspection PyMissingTypeHints
def log(log_level, *args, and_file=False, no_exception=False):
levels = {
'debug': [0, 'grey'],
'info': [1, 'cyan'],
'warning': [2, 'yellow'],
'warn': [2, 'yellow'],
'error': [3, 'red']
}
level = levels[log_level][0]
if level < Helpers.min_log_level:
return
color = levels[log_level][1] if log_level in levels else 'white'
log_str = "{} {}".format(colored("[{}]".format(datetime.utcnow().isoformat()[11:-3]),
color, attrs=['bold']),
redact_passwords(" ".join([str(x) for x in args])))
print(log_str, file=sys.stderr)
if level == 3 and not no_exception:
exc_tb = sys.exc_info()[2]
print(redact_passwords("".join(traceback.format_tb(exc_tb))), file=sys.stderr)
if and_file: # Also to file
log_file(log_level, *args)
def log_file(log_level, *args):
levels = {
'debug': 0,
'info': 1,
'warning': 2,
'error': 3,
}
if levels[log_level] < Helpers.min_log_level:
return
log_str = redact_passwords("[{}] {}: {}".format(datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
log_level.upper(), " ".join([str(x) for x in args])))
with open("errorLogs.txt", "a", encoding="utf-8") as f:
print(log_str, file=f)
def get_traceback_from_traceback_or_message(traceback_or_message):
if isinstance(traceback_or_message, str):
return traceback_or_message
else:
return ''.join(traceback.format_tb(traceback_or_message))
def log_exception(exctype, value, traceback_or_message, and_file=False, *, log_level=None):
log_level = 'error' if log_level is None else log_level
now = datetime.utcnow()
tr = get_traceback_from_traceback_or_message(traceback_or_message)
exception_only = ''.join(traceback.format_exception_only(exctype, value)).strip()
logged_msg = "{exception}\n{now} UTC\n{row}\n\n".format(exception=exception_only, now=now, row=tr)
# Redacting passwords happens in log() and ErrorLogs.add().
log(log_level, logged_msg, and_file=and_file)
ErrorLogs.add(now.timestamp(), exctype.__name__, str(value), tr)
def log_current_exception(and_file=False, log_level=None):
log_exception(*sys.exc_info(), and_file=and_file, log_level=log_level)
def log_current_thread(log_level, prefix="", postfix=""):
if prefix:
prefix += '\t'
if postfix:
postfix = '\t' + postfix
current_thread = threading.current_thread()
log(log_level, "{}current thread: {}: {}{}".format(prefix, current_thread.name, current_thread.ident, postfix))
def append_to_current_thread_name(text):
threading.current_thread().name += text
def files_changed(diff, file_set):
changed = set(diff.split())
return bool(len(changed & file_set))
core_files = {
"apigetpost.py",
"blacklists.py",
"bodyfetcher.py",
"chatcommands.py",
"chatcommunicate.py",
"chatexchange_extension.py",
"datahandling.py",
"deletionwatcher.py",
"editwatcher.py",
"excepthook.py",
"flovis.py",
"gitmanager.py",
"globalvars.py",
"helpers.py",
"metasmoke.py",
"metasmoke_cache.py",
"nocrash.py",
"number_homoglyphs.py",
"parsing.py",
"phone_numbers.py",
"queue_timings.py",
"recently_scanned_posts.py",
"socketscience.py",
"spamhandling.py",
"tasks.py",
"ws.py",
"classes/feedback.py",
"_Git_Windows.py",
"classes/__init__.py",
"classes/_Post.py",
# Before these are made reloadable
"rooms.yml",
}
reloadable_modules = {
"findspam.py",
}
module_files = core_files | reloadable_modules
def only_blacklists_changed(diff):
return not files_changed(diff, module_files)
def only_modules_changed(diff):
return not files_changed(diff, core_files)
def reload_modules():
result = True
for s in reloadable_modules:
s = s.replace(".py", "") # Relying on our naming convention
try:
# Some reliable approach
importlib.reload(sys.modules[s])
except (KeyError, ImportError):
result = False
return result
def unshorten_link(url, request_type='GET', depth=10, timeout=15):
orig_url = url
response_code = 301
# We use a valid Firefox User-Agent, because some services don't provide the forward when we tell them the
# request is from Smokedetector (e.g., cloudflare.com, which is used, as of 2024-04-12, by https://t.ly, which
# is one of the domains which we test in our CI testing).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0'}
for tries in range(depth):
if response_code not in {301, 302, 303, 307, 308}:
break
res = requests.request(request_type, url, headers=headers, stream=True, allow_redirects=False, timeout=timeout)
res.connection.close() # Discard response body for GET requests
response_code = res.status_code
if 'Location' not in res.headers:
# No more redirects, stop
break
url = res.headers['Location']
else:
raise ValueError("Too many redirects ({}) for URL {!r}".format(depth, orig_url))
return url
pcre_comment = regex.compile(r"\(\?#(?<!(?:[^\\]|^)(?:\\\\)*\\\(\?#)[^)]*\)")
def blacklist_integrity_check():
bl_files = glob('bad_*.txt') + glob('blacklisted_*.txt') + glob('watched_*.txt')
seen = dict()
errors = []
city_list = ['test']
regex.cache_all(False)
previous_bl_file = ''
for bl_file in bl_files:
print('bl_file:', bl_file)
with open(bl_file, 'r', encoding="utf-8") as lines:
try:
previous_line = ''
previous_line_number = -1
for lineno, line in enumerate(lines, 1):
if line.endswith('\r\n'):
errors.append('{0}:{1}:DOS line ending'.format(bl_file, lineno))
elif not line.endswith('\n'):
errors.append('{0}:{1}:No newline'.format(bl_file, lineno))
elif line == '\n':
errors.append('{0}:{1}:Empty line'.format(bl_file, lineno))
elif bl_file.startswith('watched_'):
line = line.split('\t')[2]
if 'numbers' not in bl_file:
try:
regex.compile(line, regex.UNICODE, city=city_list, ignore_unused=True)
except Exception:
(exctype, value, traceback_or_message) = sys.exc_info()
exception_only = ''.join(traceback.format_exception_only(exctype, value)).strip()
errors.append("{0}:{1}:Regex fails to compile:r'''{2}''':{3}".format(bl_file, lineno,
line.rstrip('\n'),
exception_only))
line = pcre_comment.sub("", line)
if line in seen:
errors.append('{0}:{1}:Duplicate entry {2} (also {3})'.format(
bl_file, lineno, line.rstrip('\n'), seen[line]))
else:
seen[line] = '{0}:{1}'.format(bl_file, lineno)
previous_line = line
previous_line_number = lineno
previous_bl_file = bl_file
except Exception:
# Some output to helps localize the problem
if previous_line_number == -1:
print('Exception raised when started reading: {}'.format(bl_file))
print('End of previous file: line {} of {}:{}'.format(lineno, previous_bl_file, line))
else:
print('previous: line {} of {}:{}'.format(previous_line_number, previous_bl_file, previous_line))
print('line {} of {}:{}'.format(lineno, bl_file, line))
raise
regex.cache_all(True)
return errors
def chunk_list(list_in, chunk_size):
"""
Split a list into chunks.
"""
return [list_in[i:i + chunk_size] for i in range(0, len(list_in), chunk_size)]
class SecurityError(Exception):
pass
def not_regex_search_ascii_and_unicode(regex_dict, test_text):
return not regex_dict['ascii'].search(test_text) and not regex_dict['unicode'].search(test_text)
def remove_regex_comments(regex_text):
return regex.sub(r"(?<!\\)\(\?\#[^\)]*\)", "", regex_text)
def remove_end_regex_comments(regex_text):
return regex.sub(r"(?:(?<!\\)\(\?\#[^\)]*\))+$", "", regex_text)
def get_only_digits(text):
return regex.sub(r"(?a)\D", "", text)
def add_to_global_bodyfetcher_queue_in_new_thread(hostname, question_id, should_check_site=False, source=None):
source_text = ""
if source:
source_text = " from {}".format(source)
t = Thread(name="bodyfetcher post enqueuing: {}/{}{}".format(hostname, question_id, source_text),
target=GlobalVars.bodyfetcher.add_to_queue,
args=(hostname, question_id, should_check_site, source))
t.start()
CONVERT_NEW_SCAN_TO_SPAM_DEFAULT_IGNORED_REASONS = set([
"blacklisted user",
])
def convert_new_scan_to_spam_result_if_new_reasons(new_info, old_info, match_ignore=None, ignored_reasons=True):
if type(old_info) is dict:
# This is for recently scanned posts, which pass the recently scanned posts entry here,
# which is a dict.
old_is_spam = old_info.get('is_spam', None)
old_reasons = old_info.get('reasons', None)
old_why = old_info.get('why', None)
elif type(old_info) is tuple:
old_is_spam, old_reasons, old_why = old_info
if not old_is_spam:
return new_info
new_is_spam, new_reasons, new_why = new_info
if new_is_spam:
return new_info
if type(new_reasons) is tuple:
# The scan was actually spam, but was declared non-spam for some reason external to the content.
# For example, that it was recently reported.
actual_new_reasons, actual_new_why = new_reasons
else:
# The new results did not actually indicate it was spam.
return new_info
if match_ignore is not None and new_why not in match_ignore:
# We only want it to be considered spam if ignored for specified reasons.
return new_info
ignored_reasons_set = set()
if ignored_reasons:
if ignored_reasons is True:
ignored_reasons_set = CONVERT_NEW_SCAN_TO_SPAM_DEFAULT_IGNORED_REASONS
else:
ignored_reasons_set = set(ignored_reasons)
actual_new_reasons_set = set(actual_new_reasons) - ignored_reasons_set
old_reasons_set = set(old_reasons) - ignored_reasons_set
if len(actual_new_reasons_set) > len(old_reasons_set) or not actual_new_reasons_set.issubset(old_reasons_set):
# There are new reasons the post would have been reported
return (True, actual_new_reasons, actual_new_why)
return new_info
def regex_compile_no_cache(regex_text, flags=0, ignore_unused=False, **kwargs):
return regex_raw_compile(regex_text, flags, ignore_unused, kwargs, False)
def color(text, color, attrs=None):
return colored(text, color, attrs=attrs)
def strip_code_elements(text, leave_note=False):
return regex.sub("(?s)<code>.*?</code>", "\nstripped code\n" if leave_note else "", text)
def strip_pre_elements(text, leave_note=False):
return regex.sub("(?s)<pre>.*?</pre>", "\nstripped pre\n" if leave_note else "", text)
def strip_pre_and_code_elements(text, leave_note=False):
return strip_code_elements(strip_pre_elements(text, leave_note=leave_note), leave_note=leave_note)
def pluralize(value, base, plural_end, single_end='', zero_is_plural=True):
return base + plural_end if value > 1 or (zero_is_plural and value == 0) else base + single_end
def get_se_api_default_params(params):
all_params = GlobalVars.se_api_default_params.copy()
all_params.update(params)
return all_params
def get_se_api_default_params_questions_answers_posts(params):
all_params = GlobalVars.se_api_default_params_questions_answers_posts.copy()
all_params.update(params)
return all_params
def get_se_api_default_params_questions_answers_posts_add_site(site):
all_params = GlobalVars.se_api_default_params_questions_answers_posts.copy()
all_params.update({'site': site})
return all_params
def get_se_api_url_for_route(route):
return GlobalVars.se_api_url_base + route
def tell_debug_rooms_recovered_websocket(which_ws, exception, connect_time, hb_time):
from chatcommunicate import tell_rooms_with
current_time = time.time()
exception_only = ''.join(traceback.format_exception_only(type(exception), exception)).strip()
exception_message = "{}: {} WebSocket: Recovered from `{}`".format(GlobalVars.location, which_ws, exception_only)
exception_message += '.' if regex.search(r'\w`$', exception_message) is not None else ''
elapsed_from_connect = current_time - connect_time
time_from_connect_message = " {:.1f} seconds after connection;".format(elapsed_from_connect)
if hb_time:
elapsed_from_hb = current_time - hb_time
time_from_hb_message = " {:.1f} seconds after heartbeat.".format(elapsed_from_hb)
else:
time_from_hb_message = " No heartbeats have been received on the current WebSocket."
timestamp = "[{} UTC]: ".format(datetime.now(timezone.utc).isoformat()[0:19])
message_without_timestamp = exception_message + time_from_connect_message + time_from_hb_message
log('debug', message_without_timestamp)
tell_rooms_with('debugwebsocket', timestamp + message_without_timestamp)
def recover_websocket(which_ws, ws, exception, connect_time, hb_time):
log_current_exception(log_level="warning")
if ws:
ws.close() # Close the socket, if it's not already closed
ws = None
try:
ws = websocket.create_connection(GlobalVars.se_websocket_url, timeout=GlobalVars.se_websocket_timeout)
tell_debug_rooms_recovered_websocket(which_ws, exception, connect_time, hb_time)
return ws
except websocket.WebSocketException:
log('error', '{} failed to recover from a WebSocketException.'.format(which_ws))
log_current_exception()
raise
# See PR 2322 for the reason for (?:^|\b) and (?:\b|$)
# (?w:\b) is also useful
KEYWORD_BOOKENDING_START = r"(?is)(?:^|\b|(?w:\b))(*PRUNE)"
KEYWORD_BOOKENDING_END = r"(?:\b|(?w:\b)|$)"
KEYWORD_NON_BOOKENDING_START = r"(?i)"
KEYWORD_NON_BOOKENDING_END = r""
def keyword_bookend_regex_text(regex_text):
return r"{}(?:{}){}".format(KEYWORD_BOOKENDING_START, regex_text, KEYWORD_BOOKENDING_END)
def get_bookended_keyword_regex_text_from_entries(entries):
return keyword_bookend_regex_text('|'.join(entries))
def keyword_non_bookend_regex_text(regex_text):
return r"{}(?:{}){}".format(KEYWORD_NON_BOOKENDING_START, regex_text, KEYWORD_NON_BOOKENDING_END)
def get_non_bookended_keyword_regex_text_from_entries(entries):
return keyword_non_bookend_regex_text('|'.join(entries))