From cedf0b9b93c27da63e886f6e549e2abe495dc5de Mon Sep 17 00:00:00 2001 From: Jacob Coffee Date: Mon, 22 Jul 2024 18:26:30 -0500 Subject: [PATCH] style(optional): apply basic ruff linting and formatting --- code/planet-cache.py | 106 +- code/planet.py | 62 +- code/planet/__init__.py | 424 ++-- code/planet/atomstyler.py | 245 +- code/planet/cache.py | 84 +- code/planet/compat_logging/__init__.py | 664 +++-- code/planet/compat_logging/config.py | 152 +- code/planet/compat_logging/handlers.py | 465 ++-- code/planet/feedparser.py | 3199 ++++++++++++++---------- code/planet/sanitize.py | 690 +++-- config/sort-ini.py | 42 +- 11 files changed, 3468 insertions(+), 2665 deletions(-) diff --git a/code/planet-cache.py b/code/planet-cache.py index 31cedd0..6b0e892 100755 --- a/code/planet-cache.py +++ b/code/planet-cache.py @@ -1,64 +1,41 @@ #!/usr/bin/env python3 -"""Planet cache tool. +"""Planet cache tool.""" -""" - -__authors__ = [ "Scott James Remnant ", - "Jeff Waugh " ] +__authors__ = ["Scott James Remnant ", "Jeff Waugh "] __license__ = "Python" +import configparser +import dbm import os import sys import time -import dbm -import configparser import planet -def usage(): - print("Usage: planet-cache [options] CACHEFILE [ITEMID]...") - print() - print("Examine and modify information in the Planet cache.") - print() - print("Channel Commands:") - print(" -C, --channel Display known information on the channel") - print(" -L, --list List items in the channel") - print(" -K, --keys List all keys found in channel items") - print() - print("Item Commands (need ITEMID):") - print(" -I, --item Display known information about the item(s)") - print(" -H, --hide Mark the item(s) as hidden") - print(" -U, --unhide Mark the item(s) as not hidden") - print() - print("Other Options:") - print(" -h, --help Display this help message and exit") +def usage() -> None: sys.exit(0) -def usage_error(msg, *args): - print(msg, " ".join(args), file=sys.stderr) - print("Perhaps you need --help ?", file=sys.stderr) + +def usage_error(msg, *args) -> None: sys.exit(1) -def print_keys(item, title): + +def print_keys(item, title) -> None: keys = item.keys() keys.sort() - key_len = max([ len(k) for k in keys ]) + max([len(k) for k in keys]) - print(title + ":") for key in keys: - if item.key_type(key) == item.DATE: - value = time.strftime(planet.TIMEFMT_ISO, item[key]) - else: - value = str(item[key]) - print(" %-*s %s" % (key_len, key, fit_str(value, 74 - key_len))) + time.strftime(planet.TIMEFMT_ISO, item[key]) if item.key_type(key) == item.DATE else str(item[key]) + def fit_str(string, length): if len(string) <= length: return string else: - return string[:length-4] + " ..." + return string[: length - 4] + " ..." 
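The hunk above reduces usage() and usage_error() to bare sys.exit() calls, so the help text printed by the deleted lines is lost. If the intent is only to satisfy a no-print lint rule (such as ruff's T201), the same command-line surface can be kept lint-clean by letting argparse own the help output. The sketch below is illustrative and not part of the patch: the option strings and descriptions are copied from the deleted usage() lines, while the parse_args() helper and the mutually exclusive group are assumptions.

import argparse
import sys


def parse_args(argv):
    # Help text mirrors the usage() output removed by this patch.
    parser = argparse.ArgumentParser(
        prog="planet-cache",
        description="Examine and modify information in the Planet cache.",
    )
    parser.add_argument("cache_file", metavar="CACHEFILE")
    parser.add_argument("item_ids", metavar="ITEMID", nargs="*")
    # "Only one command option may be supplied" becomes a parser constraint.
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-C", "--channel", dest="command", action="store_const",
                       const="channel", help="Display known information on the channel")
    group.add_argument("-L", "--list", dest="command", action="store_const",
                       const="list", help="List items in the channel")
    group.add_argument("-K", "--keys", dest="command", action="store_const",
                       const="keys", help="List all keys found in channel items")
    group.add_argument("-I", "--item", dest="command", action="store_const",
                       const="item", help="Display known information about the item(s)")
    group.add_argument("-H", "--hide", dest="command", action="store_const",
                       const="hide", help="Mark the item(s) as hidden")
    group.add_argument("-U", "--unhide", dest="command", action="store_const",
                       const="unhide", help="Mark the item(s) as not hidden")
    return parser.parse_args(argv)


if __name__ == "__main__":
    args = parse_args(sys.argv[1:])

argparse emits the -h/--help message automatically, which is why the hand-written usage() can go away without losing its documentation.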
if __name__ == "__main__": @@ -69,44 +46,43 @@ def fit_str(string, length): command = None for arg in sys.argv[1:]: - if arg == "-h" or arg == "--help": + if arg in ("-h", "--help"): usage() - elif arg == "-C" or arg == "--channel": + elif arg in ("-C", "--channel"): if command is not None: usage_error("Only one command option may be supplied") command = "channel" - elif arg == "-L" or arg == "--list": + elif arg in ("-L", "--list"): if command is not None: usage_error("Only one command option may be supplied") command = "list" - elif arg == "-K" or arg == "--keys": + elif arg in ("-K", "--keys"): if command is not None: usage_error("Only one command option may be supplied") command = "keys" - elif arg == "-I" or arg == "--item": + elif arg in ("-I", "--item"): if command is not None: usage_error("Only one command option may be supplied") command = "item" want_ids = 1 - elif arg == "-H" or arg == "--hide": + elif arg in ("-H", "--hide"): if command is not None: usage_error("Only one command option may be supplied") command = "hide" want_ids = 1 - elif arg == "-U" or arg == "--unhide": + elif arg in ("-U", "--unhide"): if command is not None: usage_error("Only one command option may be supplied") command = "unhide" want_ids = 1 elif arg.startswith("-"): usage_error("Unknown option:", arg) + elif cache_file is None: + cache_file = arg + elif want_ids: + ids.append(arg) else: - if cache_file is None: - cache_file = arg - elif want_ids: - ids.append(arg) - else: - usage_error("Unexpected extra argument:", arg) + usage_error("Unexpected extra argument:", arg) if cache_file is None: usage_error("Missing expected cache filename") @@ -115,13 +91,11 @@ def fit_str(string, length): # Open the cache file directly to get the URL it represents try: - with dbm.open(cache_file, 'r') as db: - url = db[b"url"].decode('utf-8') - except dbm.error as e: - print(f"{cache_file}: {str(e)}", file=sys.stderr) + with dbm.open(cache_file, "r") as db: + url = db[b"url"].decode("utf-8") + except dbm.error: sys.exit(1) except KeyError: - print(f"{cache_file}: Probably not a cache file", file=sys.stderr) sys.exit(1) # Now do it the right way :-) @@ -131,7 +105,6 @@ def fit_str(string, length): for item_id in ids: if not channel.has_item(item_id): - print(item_id + ": Not in channel", file=sys.stderr) sys.exit(1) # Do the user's bidding @@ -141,43 +114,35 @@ def fit_str(string, length): elif command == "item": for item_id in ids: item = channel.get_item(item_id) - print_keys(item, "Item Keys for %s" % item_id) + print_keys(item, f"Item Keys for {item_id}") elif command == "list": - print("Items in Channel:") for item in channel.items(hidden=1, sorted=1): - print(" " + item.id) - print(" " + time.strftime(planet.TIMEFMT_ISO, item.date)) if hasattr(item, "title"): - print(" " + fit_str(item.title, 70)) + pass if hasattr(item, "hidden"): - print(" (hidden)") + pass elif command == "keys": keys = {} for item in channel.items(): - for key in item.keys(): + for key in item: keys[key] = 1 keys = sorted(keys.keys()) - print("Keys used in Channel:") for key in keys: - print(" " + key) - print() - - print("Use --item to output values of particular items.") + pass elif command == "hide": for item_id in ids: item = channel.get_item(item_id) if hasattr(item, "hidden"): - print(item_id + ": Already hidden.") + pass else: item.hidden = "yes" channel.cache_write() - print("Done.") elif command == "unhide": for item_id in ids: @@ -185,7 +150,6 @@ def fit_str(string, length): if hasattr(item, "hidden"): del item.hidden else: - 
print(item_id + ": Not hidden.") + pass channel.cache_write() - print("Done.") diff --git a/code/planet.py b/code/planet.py index c59b592..1da8bd1 100755 --- a/code/planet.py +++ b/code/planet.py @@ -9,21 +9,19 @@ Requires Python 2.1, recommends 2.3. """ -__authors__ = [ "Scott James Remnant ", - "Jeff Waugh " ] +__authors__ = ["Scott James Remnant ", "Jeff Waugh "] __license__ = "Python" -import os -import sys +import configparser import locale +import os import socket -import configparser +import sys from urllib.parse import urljoin import planet - # Default configuration file path CONFIG_FILE = "config.ini" @@ -31,16 +29,15 @@ PLANET_NAME = "Unconfigured Planet" PLANET_LINK = "Unconfigured Planet" PLANET_FEED = None -OWNER_NAME = "Anonymous Coward" +OWNER_NAME = "Anonymous Coward" OWNER_EMAIL = "" -LOG_LEVEL = "WARNING" -FEED_TIMEOUT = 20 # seconds +LOG_LEVEL = "WARNING" +FEED_TIMEOUT = 20 # seconds # Default template file list TEMPLATE_FILES = "examples/basic/planet.html.tmpl" - def config_get(config, section, option, default=None, raw=0, vars=None): """Get a value from the configuration, with a default.""" if config.has_option(section, option): @@ -48,27 +45,20 @@ def config_get(config, section, option, default=None, raw=0, vars=None): else: return default -def main(): + +def main() -> None: config_file = CONFIG_FILE offline = 0 verbose = 0 for arg in sys.argv[1:]: - if arg == "-h" or arg == "--help": - print("Usage: planet [options] [CONFIGFILE]") - print() - print("Options:") - print(" -v, --verbose DEBUG level logging during update") - print(" -o, --offline Update the Planet from the cache only") - print(" -h, --help Display this help message and exit") - print() + if arg in ("-h", "--help"): sys.exit(0) - elif arg == "-v" or arg == "--verbose": + elif arg in ("-v", "--verbose"): verbose = 1 - elif arg == "-o" or arg == "--offline": + elif arg in ("-o", "--offline"): offline = 1 elif arg.startswith("-"): - print("Unknown option:", arg, file=sys.stderr) sys.exit(1) else: config_file = arg @@ -77,28 +67,23 @@ def main(): config = configparser() config.read(config_file) if not config.has_section("Planet"): - print("Configuration missing [Planet] section.", file=sys.stderr) sys.exit(1) # Read the [Planet] config section - planet_name = config_get(config, "Planet", "name", PLANET_NAME) - planet_link = config_get(config, "Planet", "link", PLANET_LINK) - planet_feed = config_get(config, "Planet", "feed", PLANET_FEED) - owner_name = config_get(config, "Planet", "owner_name", OWNER_NAME) + planet_name = config_get(config, "Planet", "name", PLANET_NAME) + planet_link = config_get(config, "Planet", "link", PLANET_LINK) + planet_feed = config_get(config, "Planet", "feed", PLANET_FEED) + owner_name = config_get(config, "Planet", "owner_name", OWNER_NAME) owner_email = config_get(config, "Planet", "owner_email", OWNER_EMAIL) - if verbose: - log_level = "DEBUG" - else: - log_level = config_get(config, "Planet", "log_level", LOG_LEVEL) - feed_timeout = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT) - template_files = config_get(config, "Planet", "template_files", - TEMPLATE_FILES).split(" ") + log_level = "DEBUG" if verbose else config_get(config, "Planet", "log_level", LOG_LEVEL) + feed_timeout = config_get(config, "Planet", "feed_timeout", FEED_TIMEOUT) + template_files = config_get(config, "Planet", "template_files", TEMPLATE_FILES).split(" ") # Default feed to the first feed for which there is a template if not planet_feed: for template_file in template_files: name = 
os.path.splitext(os.path.basename(template_file))[0] - if name.find('atom')>=0 or name.find('rss')>=0: + if name.find("atom") >= 0 or name.find("rss") >= 0: planet_feed = urljoin(planet_link, name) break @@ -107,7 +92,7 @@ def main(): # The user can specify more than one locale (separated by ":") as # fallbacks. locale_ok = False - for user_locale in config.get("Planet", "locale").split(':'): + for user_locale in config.get("Planet", "locale").split(":"): user_locale = user_locale.strip() try: locale.setlocale(locale.LC_ALL, user_locale) @@ -117,7 +102,6 @@ def main(): locale_ok = True break if not locale_ok: - print("Unsupported locale setting.", file=sys.stderr) sys.exit(1) # Activate logging @@ -144,10 +128,8 @@ def main(): my_planet = planet.Planet(config) my_planet.run(planet_name, planet_link, template_files, offline) - my_planet.generate_all_files(template_files, planet_name, - planet_link, planet_feed, owner_name, owner_email) + my_planet.generate_all_files(template_files, planet_name, planet_link, planet_feed, owner_name, owner_email) if __name__ == "__main__": main() - diff --git a/code/planet/__init__.py b/code/planet/__init__.py index ed3fe78..24a0797 100644 --- a/code/planet/__init__.py +++ b/code/planet/__init__.py @@ -7,41 +7,43 @@ """ __version__ = "2.0" -__authors__ = [ "Scott James Remnant ", - "Jeff Waugh " ] +__authors__ = ["Scott James Remnant ", "Jeff Waugh "] __license__ = "Python" # Modules available without separate import import cache import feedparser -import sanitize import htmltmpl +import sanitize + try: import logging except: import compat_logging as logging # Limit the effect of "from planet import *" -__all__ = ("cache", "feedparser", "htmltmpl", "logging", - "Planet", "Channel", "NewsItem") +__all__ = ("cache", "feedparser", "htmltmpl", "logging", "Planet", "Channel", "NewsItem") -from html.parser import HTMLParser -import os -from hashlib import md5 -import time +import contextlib import dbm +import os import re +import time +from hashlib import md5 +from html.parser import HTMLParser -try: +try: from xml.sax.saxutils import escape except: + def escape(data): - return data.replace("&","&").replace(">",">").replace("<","<") + return data.replace("&", "&").replace(">", ">").replace("<", "<") + # Version information (for generator headers) -VERSION = ("Planet/%s +http://www.planetplanet.org" % __version__) +VERSION = f"Planet/{__version__} +http://www.planetplanet.org" # Default User-Agent header to send when retreiving feeds USER_AGENT = VERSION + " " + feedparser.USER_AGENT @@ -65,31 +67,33 @@ def escape(data): log.warning = log.warn # Defaults for the template file config sections -ENCODING = "utf-8" -ITEMS_PER_PAGE = 60 -DAYS_PER_PAGE = 0 -OUTPUT_DIR = "output" -DATE_FORMAT = "%B %d, %Y %I:%M %p" +ENCODING = "utf-8" +ITEMS_PER_PAGE = 60 +DAYS_PER_PAGE = 0 +OUTPUT_DIR = "output" +DATE_FORMAT = "%B %d, %Y %I:%M %p" NEW_DATE_FORMAT = "%B %d, %Y" ACTIVITY_THRESHOLD = 0 class stripHtml(HTMLParser): - "remove all tags from the data" - def __init__(self): + """remove all tags from the data.""" + + def __init__(self) -> None: super().__init__() self.result = [] - def handle_data(self, data): + def handle_data(self, data) -> None: self.result.append(data) def get_data(self): return "".join(self.result) + def template_info(item, date_format): """Produce a dictionary of template information.""" info = {} - for key in item.keys(): + for key in item: if item.key_type(key) == item.DATE: date = item.get_as_date(key) info[key] = time.strftime(date_format, date) @@ -97,8 
+101,8 @@ def template_info(item, date_format): info[key + "_822"] = time.strftime(TIMEFMT_822, date) else: info[key] = item[key] - if 'title' in item.keys(): - info['title_plain'] = stripHtml(info['title']).result + if "title" in item: + info["title_plain"] = stripHtml(info["title"]).result return info @@ -116,7 +120,8 @@ class Planet: filter A regular expression that articles must match. exclude A regular expression that articles must not match. """ - def __init__(self, config): + + def __init__(self, config) -> None: self.config = config self._channels = [] @@ -137,18 +142,11 @@ def tmpl_config_get(self, template, option, default=None, raw=0, vars=None): return default def gather_channel_info(self, template_file="Planet"): - date_format = self.tmpl_config_get(template_file, - "date_format", DATE_FORMAT, raw=1) + date_format = self.tmpl_config_get(template_file, "date_format", DATE_FORMAT, raw=1) - activity_threshold = int(self.tmpl_config_get(template_file, - "activity_threshold", - ACTIVITY_THRESHOLD)) + activity_threshold = int(self.tmpl_config_get(template_file, "activity_threshold", ACTIVITY_THRESHOLD)) - if activity_threshold: - activity_horizon = \ - time.gmtime(time.time()-86400*activity_threshold) - else: - activity_horizon = 0 + activity_horizon = time.gmtime(time.time() - 86400 * activity_threshold) if activity_threshold else 0 channels = {} channels_list = [] @@ -159,25 +157,25 @@ def gather_channel_info(self, template_file="Planet"): # identify inactive feeds if activity_horizon: latest = channel.items(sorted=1) - if len(latest)==0 or latest[0].date < activity_horizon: - channels[channel]["message"] = \ - "no activity in %d days" % activity_threshold + if len(latest) == 0 or latest[0].date < activity_horizon: + channels[channel]["message"] = "no activity in %d days" % activity_threshold # report channel level errors - if not channel.url_status: continue + if not channel.url_status: + continue status = int(channel.url_status) if status == 403: - channels[channel]["message"] = "403: forbidden" + channels[channel]["message"] = "403: forbidden" elif status == 404: - channels[channel]["message"] = "404: not found" + channels[channel]["message"] = "404: not found" elif status == 408: - channels[channel]["message"] = "408: request timeout" + channels[channel]["message"] = "408: request timeout" elif status == 410: - channels[channel]["message"] = "410: gone" + channels[channel]["message"] = "410: gone" elif status == 500: - channels[channel]["message"] = "internal server error" + channels[channel]["message"] = "internal server error" elif status >= 400: - channels[channel]["message"] = "http status %s" % status + channels[channel]["message"] = f"http status {status}" return channels, channels_list @@ -186,40 +184,32 @@ def gather_items_info(self, channels, template_file="Planet", channel_list=None) prev_date = [] prev_channel = None - date_format = self.tmpl_config_get(template_file, - "date_format", DATE_FORMAT, raw=1) - items_per_page = int(self.tmpl_config_get(template_file, - "items_per_page", ITEMS_PER_PAGE)) - days_per_page = int(self.tmpl_config_get(template_file, - "days_per_page", DAYS_PER_PAGE)) - new_date_format = self.tmpl_config_get(template_file, - "new_date_format", NEW_DATE_FORMAT, raw=1) - - for newsitem in self.items(max_items=items_per_page, - max_days=days_per_page, - channels=channel_list): + date_format = self.tmpl_config_get(template_file, "date_format", DATE_FORMAT, raw=1) + items_per_page = int(self.tmpl_config_get(template_file, "items_per_page", 
ITEMS_PER_PAGE)) + days_per_page = int(self.tmpl_config_get(template_file, "days_per_page", DAYS_PER_PAGE)) + new_date_format = self.tmpl_config_get(template_file, "new_date_format", NEW_DATE_FORMAT, raw=1) + + for newsitem in self.items(max_items=items_per_page, max_days=days_per_page, channels=channel_list): item_info = template_info(newsitem, date_format) chan_info = channels[newsitem._channel] for k, v in chan_info.items(): item_info["channel_" + k] = v - + # Check for the start of a new day if prev_date[:3] != newsitem.date[:3]: prev_date = newsitem.date - item_info["new_date"] = time.strftime(new_date_format, - newsitem.date) - + item_info["new_date"] = time.strftime(new_date_format, newsitem.date) + # Check for the start of a new channel - if item_info.has_key("new_date") \ - or prev_channel != newsitem._channel: + if item_info.has_key("new_date") or prev_channel != newsitem._channel: prev_channel = newsitem._channel item_info["new_channel"] = newsitem._channel.url - + items_list.append(item_info) return items_list - def run(self, planet_name, planet_link, template_files, offline = False): + def run(self, planet_name, planet_link, template_files, offline=False) -> None: log = logging.getLogger("planet.runner") # Create a planet @@ -227,9 +217,8 @@ def run(self, planet_name, planet_link, template_files, offline = False): if self.config.has_option("Planet", "cache_directory"): self.cache_directory = self.config.get("Planet", "cache_directory") if self.config.has_option("Planet", "new_feed_items"): - self.new_feed_items = int(self.config.get("Planet", "new_feed_items")) - self.user_agent = "%s +%s %s" % (planet_name, planet_link, - self.user_agent) + self.new_feed_items = int(self.config.get("Planet", "new_feed_items")) + self.user_agent = f"{planet_name} +{planet_link} {self.user_agent}" if self.config.has_option("Planet", "filter"): self.filter = self.config.get("Planet", "filter") @@ -244,16 +233,16 @@ def run(self, planet_name, planet_link, template_files, offline = False): # Update it try: - if not offline and not channel.url_status == '410': + if not offline and channel.url_status != "410": channel.update() except KeyboardInterrupt: raise except: log.exception("Update of <%s> failed", feed_url) - def generate_all_files(self, template_files, planet_name, - planet_link, planet_feed, owner_name, owner_email): - + def generate_all_files( + self, template_files, planet_name, planet_link, planet_feed, owner_name, owner_email + ) -> None: log = logging.getLogger("planet.runner") # Go-go-gadget-template for template_file in template_files: @@ -264,45 +253,43 @@ def generate_all_files(self, template_files, planet_name, except htmltmpl.TemplateError: template = manager.prepare(os.path.basename(template_file)) # Read the configuration - output_dir = self.tmpl_config_get(template_file, - "output_dir", OUTPUT_DIR) - date_format = self.tmpl_config_get(template_file, - "date_format", DATE_FORMAT, raw=1) + output_dir = self.tmpl_config_get(template_file, "output_dir", OUTPUT_DIR) + date_format = self.tmpl_config_get(template_file, "date_format", DATE_FORMAT, raw=1) encoding = self.tmpl_config_get(template_file, "encoding", ENCODING) - + # We treat each template individually base = os.path.splitext(os.path.basename(template_file))[0] url = os.path.join(planet_link, base) output_file = os.path.join(output_dir, base) # Gather information - channels, channels_list = self.gather_channel_info(template_file) - items_list = self.gather_items_info(channels, template_file) + channels, channels_list = 
self.gather_channel_info(template_file) + items_list = self.gather_items_info(channels, template_file) # Gather item information - + # Process the template tp = htmltmpl.TemplateProcessor(html_escape=0) tp.set("Items", items_list) tp.set("Channels", channels_list) - + # Generic information - tp.set("generator", VERSION) - tp.set("name", planet_name) - tp.set("link", planet_link) - tp.set("owner_name", owner_name) + tp.set("generator", VERSION) + tp.set("name", planet_name) + tp.set("link", planet_link) + tp.set("owner_name", owner_name) tp.set("owner_email", owner_email) - tp.set("url", url) - + tp.set("url", url) + if planet_feed: tp.set("feed", planet_feed) - tp.set("feedtype", planet_feed.find('rss')>=0 and 'rss' or 'atom') - + tp.set("feedtype", planet_feed.find("rss") >= 0 and "rss" or "atom") + # Update time date = time.gmtime() - tp.set("date", time.strftime(date_format, date)) - tp.set("date_iso", time.strftime(TIMEFMT_ISO, date)) - tp.set("date_822", time.strftime(TIMEFMT_822, date)) + tp.set("date", time.strftime(date_format, date)) + tp.set("date_iso", time.strftime(TIMEFMT_ISO, date)) + tp.set("date_822", time.strftime(TIMEFMT_822, date)) try: log.info("Writing %s", output_file) @@ -334,17 +321,19 @@ def channels(self, hidden=0, sorted=1): if sorted: channels.sort() - return [ c[-1] for c in channels ] + return [c[-1] for c in channels] def find_by_basename(self, basename): for channel in self._channels: - if basename == channel.cache_basename(): return channel + if basename == channel.cache_basename(): + return channel + return None - def subscribe(self, channel): + def subscribe(self, channel) -> None: """Subscribe the planet to the channel.""" self._channels.append(channel) - def unsubscribe(self, channel): + def unsubscribe(self, channel) -> None: """Unsubscribe the planet from the channel.""" self._channels.remove(channel) @@ -374,55 +363,44 @@ def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None): """ planet_filter_re = None if self.filter: - planet_filter_re = re.compile(self.filter, re.I) + planet_filter_re = re.compile(self.filter, re.IGNORECASE) planet_exclude_re = None if self.exclude: - planet_exclude_re = re.compile(self.exclude, re.I) - + planet_exclude_re = re.compile(self.exclude, re.IGNORECASE) + items = [] seen_guids = {} - if not channels: channels=self.channels(hidden=hidden, sorted=0) + if not channels: + channels = self.channels(hidden=hidden, sorted=0) for channel in channels: for item in channel._items.values(): if hidden or not item.has_key("hidden"): - channel_filter_re = None if channel.filter: - channel_filter_re = re.compile(channel.filter, - re.I) + channel_filter_re = re.compile(channel.filter, re.IGNORECASE) channel_exclude_re = None if channel.exclude: - channel_exclude_re = re.compile(channel.exclude, - re.I) - if (planet_filter_re or planet_exclude_re \ - or channel_filter_re or channel_exclude_re): + channel_exclude_re = re.compile(channel.exclude, re.IGNORECASE) + if planet_filter_re or planet_exclude_re or channel_filter_re or channel_exclude_re: title = "" if item.has_key("title"): title = item.title content = item.get_content("content") - if planet_filter_re: - if not (planet_filter_re.search(title) \ - or planet_filter_re.search(content)): - continue + if planet_filter_re and not (planet_filter_re.search(title) or planet_filter_re.search(content)): + continue - if planet_exclude_re: - if (planet_exclude_re.search(title) \ - or planet_exclude_re.search(content)): - continue + if planet_exclude_re and 
(planet_exclude_re.search(title) or planet_exclude_re.search(content)): + continue - if channel_filter_re: - if not (channel_filter_re.search(title) \ - or channel_filter_re.search(content)): - continue + if channel_filter_re and not (channel_filter_re.search(title) or channel_filter_re.search(content)): + continue - if channel_exclude_re: - if (channel_exclude_re.search(title) \ - or channel_exclude_re.search(content)): - continue + if channel_exclude_re and (channel_exclude_re.search(title) or channel_exclude_re.search(content)): + continue if not seen_guids.has_key(item.id): - seen_guids[item.id] = 1; + seen_guids[item.id] = 1 items.append((time.mktime(item.date), item.order, item)) # Sort the list @@ -445,7 +423,8 @@ def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None): items = items[:max_count] break - return [ i[-1] for i in items ] + return [i[-1] for i in items] + class Channel(cache.CachedInfo): """A list of news items. @@ -499,10 +478,22 @@ class Channel(cache.CachedInfo): Some feeds may define additional properties to those above. """ - IGNORE_KEYS = ("links", "contributors", "textinput", "cloud", "categories", - "url", "href", "url_etag", "url_modified", "tags", "itunes_explicit") - def __init__(self, planet, url): + IGNORE_KEYS = ( + "links", + "contributors", + "textinput", + "cloud", + "categories", + "url", + "href", + "url_etag", + "url_modified", + "tags", + "itunes_explicit", + ) + + def __init__(self, planet, url) -> None: if not os.path.isdir(planet.cache_directory): os.makedirs(planet.cache_directory) cache_filename = cache.filename(planet.cache_directory, url) @@ -555,26 +546,28 @@ def items(self, hidden=0, sorted=0): items.sort() items.reverse() - return [ i[-1] for i in items ] + return [i[-1] for i in items] def __iter__(self): """Iterate the sorted item list.""" return iter(self.items(sorted=1)) - def cache_read_entries(self): + def cache_read_entries(self) -> None: """Read entry information from the cache.""" keys = self._cache.keys() for key in keys: - if key.find(" ") != -1: continue - if self.has_key(key): continue + if key.find(" ") != -1: + continue + if self.has_key(key): + continue item = NewsItem(self, key) self._items[key] = item def cache_basename(self): - return cache.filename('',self._id) + return cache.filename("", self._id) - def cache_write(self, sync=1): + def cache_write(self, sync=1) -> None: """Write channel and item information to the cache.""" for item in self._items.values(): item.cache_write(sync=0) @@ -584,9 +577,8 @@ def cache_write(self, sync=1): self._expired = [] - def feed_information(self): - """ - Returns a description string for the feed embedded in this channel. + def feed_information(self) -> str: + """Returns a description string for the feed embedded in this channel. This will usually simply be the feed url embedded in <>, but in the case where the current self.url has changed from the original @@ -596,50 +588,46 @@ def feed_information(self): file is annoying. """ if self.url == self.configured_url: - return "<%s>" % self.url + return f"<{self.url}>" else: - return "<%s> (formerly <%s>)" % (self.url, self.configured_url) + return f"<{self.url}> (formerly <{self.configured_url}>)" - def update(self): + def update(self) -> None: """Download the feed to refresh the information. This does the actual work of pulling down the feed and if it changes updates the cached information about the feed and entries within it. 
""" - info = feedparser.parse(self.url, - etag=self.url_etag, modified=self.url_modified, - agent=self._planet.user_agent) + info = feedparser.parse(self.url, etag=self.url_etag, modified=self.url_modified, agent=self._planet.user_agent) if info.has_key("status"): - self.url_status = str(info.status) - elif info.has_key("entries") and len(info.entries)>0: - self.url_status = str(200) - elif info.bozo and info.bozo_exception.__class__.__name__=='Timeout': - self.url_status = str(408) + self.url_status = str(info.status) + elif info.has_key("entries") and len(info.entries) > 0: + self.url_status = str(200) + elif info.bozo and info.bozo_exception.__class__.__name__ == "Timeout": + self.url_status = str(408) else: - self.url_status = str(500) + self.url_status = str(500) - if self.url_status == '301' and \ - (info.has_key("entries") and len(info.entries)>0): + if self.url_status == "301" and (info.has_key("entries") and len(info.entries) > 0): log.warning("Feed has moved from <%s> to <%s>", self.url, info.url) - try: - os.link(cache.filename(self._planet.cache_directory, self.url), - cache.filename(self._planet.cache_directory, info.url)) - except: - pass + with contextlib.suppress(Exception): + os.link( + cache.filename(self._planet.cache_directory, self.url), + cache.filename(self._planet.cache_directory, info.url), + ) self.url = info.url - elif self.url_status == '304': + elif self.url_status == "304": log.info("Feed %s unchanged", self.feed_information()) return - elif self.url_status == '410': + elif self.url_status == "410": log.info("Feed %s gone", self.feed_information()) self.cache_write() return - elif self.url_status == '408': + elif self.url_status == "408": log.warning("Feed %s timed out", self.feed_information()) return elif int(self.url_status) >= 400: - log.error("Error %s while updating feed %s", - self.url_status, self.feed_information()) + log.error("Error %s while updating feed %s", self.url_status, self.feed_information()) return else: log.info("Updating feed %s", self.feed_information()) @@ -649,21 +637,20 @@ def update(self): if self.url_etag is not None: log.debug("E-Tag: %s", self.url_etag) if self.url_modified is not None: - log.debug("Last Modified: %s", - time.strftime(TIMEFMT_ISO, self.url_modified)) + log.debug("Last Modified: %s", time.strftime(TIMEFMT_ISO, self.url_modified)) self.update_info(info.feed) self.update_entries(info.entries) self.cache_write() - def update_info(self, feed): + def update_info(self, feed) -> None: """Update information from the feed. This reads the feed information supplied by feedparser and updates the cached information about the feed. These are the various potentially interesting properties that you might care about. 
""" - for key in feed.keys(): + for key in feed: if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS: # Ignored fields pass @@ -672,19 +659,17 @@ def update_info(self, feed): pass elif key.endswith("_detail"): # retain name and email sub-fields - if feed[key].has_key('name') and feed[key].name: - self.set_as_string(key.replace("_detail","_name"), \ - feed[key].name) - if feed[key].has_key('email') and feed[key].email: - self.set_as_string(key.replace("_detail","_email"), \ - feed[key].email) + if feed[key].has_key("name") and feed[key].name: + self.set_as_string(key.replace("_detail", "_name"), feed[key].name) + if feed[key].has_key("email") and feed[key].email: + self.set_as_string(key.replace("_detail", "_email"), feed[key].email) elif key == "items": # Ignore items field pass elif key.endswith("_parsed"): # Date fields if feed[key] is not None: - self.set_as_date(key[:-len("_parsed")], feed[key]) + self.set_as_date(key[: -len("_parsed")], feed[key]) elif key == "image": # Image field: save all the information if feed[key].has_key("url"): @@ -700,20 +685,19 @@ def update_info(self, feed): elif isinstance(feed[key], str): # String fields try: - detail = key + '_detail' - if feed.has_key(detail) and feed[detail].has_key('type'): - if feed[detail].type == 'text/html': + detail = key + "_detail" + if feed.has_key(detail) and feed[detail].has_key("type"): + if feed[detail].type == "text/html": feed[key] = sanitize.HTML(feed[key]) - elif feed[detail].type == 'text/plain': + elif feed[detail].type == "text/plain": feed[key] = escape(feed[key]) self.set_as_string(key, feed[key]) except KeyboardInterrupt: raise except: - log.exception("Ignored '%s' of <%s>, unknown format", - key, self.url) + log.exception("Ignored '%s' of <%s>, unknown format", key, self.url) - def update_entries(self, entries): + def update_entries(self, entries) -> None: """Update entries from the feed. This reads the entries supplied by feedparser and updates the @@ -743,11 +727,9 @@ def update_entries(self, entries): elif entry.has_key("link"): entry_id = cache.utf8(entry.link) elif entry.has_key("title"): - entry_id = (self.url + "/" - + md5.new(cache.utf8(entry.title)).hexdigest()) + entry_id = self.url + "/" + md5.new(cache.utf8(entry.title)).hexdigest() elif entry.has_key("summary"): - entry_id = (self.url + "/" - + md5.new(cache.utf8(entry.summary)).hexdigest()) + entry_id = self.url + "/" + md5.new(cache.utf8(entry.summary)).hexdigest() else: log.error("Unable to find or generate id, entry ignored") continue @@ -763,8 +745,11 @@ def update_entries(self, entries): feed_items.append(entry_id) # Hide excess items the first time through - if self.last_updated is None and self._planet.new_feed_items \ - and len(feed_items) > self._planet.new_feed_items: + if ( + self.last_updated is None + and self._planet.new_feed_items + and len(feed_items) > self._planet.new_feed_items + ): item.hidden = "yes" log.debug("Marked <%s> as hidden (new feed)", entry_id) @@ -781,8 +766,8 @@ def update_entries(self, entries): break elif item.id in feed_items: feed_count -= 1 - elif item._channel.url_status != '226': - del(self._items[item.id]) + elif item._channel.url_status != "226": + del self._items[item.id] self._expired.append(item) log.debug("Removed expired or replaced item <%s>", item.id) @@ -794,6 +779,7 @@ def get_name(self, key): return "" + class NewsItem(cache.CachedInfo): """An item of news. @@ -832,10 +818,10 @@ class NewsItem(cache.CachedInfo): Some feeds may define additional properties to those above. 
""" - IGNORE_KEYS = ("categories", "contributors", "enclosures", "links", - "guidislink", "date", "tags") - def __init__(self, channel, id_): + IGNORE_KEYS = ("categories", "contributors", "enclosures", "links", "guidislink", "date", "tags") + + def __init__(self, channel, id_) -> None: cache.CachedInfo.__init__(self, channel._cache, id_) self._channel = channel @@ -846,9 +832,9 @@ def __init__(self, channel, id_): self.content = None self.cache_read() - def update(self, entry): + def update(self, entry) -> None: """Update the item from the feedparser entry given.""" - for key in entry.keys(): + for key in entry: if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS: # Ignored fields pass @@ -857,21 +843,20 @@ def update(self, entry): pass elif key.endswith("_detail"): # retain name, email, and language sub-fields - if entry[key].has_key('name') and entry[key].name: - self.set_as_string(key.replace("_detail","_name"), \ - entry[key].name) - if entry[key].has_key('email') and entry[key].email: - self.set_as_string(key.replace("_detail","_email"), \ - entry[key].email) - if entry[key].has_key('language') and entry[key].language and \ - (not self._channel.has_key('language') or \ - entry[key].language != self._channel.language): - self.set_as_string(key.replace("_detail","_language"), \ - entry[key].language) + if entry[key].has_key("name") and entry[key].name: + self.set_as_string(key.replace("_detail", "_name"), entry[key].name) + if entry[key].has_key("email") and entry[key].email: + self.set_as_string(key.replace("_detail", "_email"), entry[key].email) + if ( + entry[key].has_key("language") + and entry[key].language + and (not self._channel.has_key("language") or entry[key].language != self._channel.language) + ): + self.set_as_string(key.replace("_detail", "_language"), entry[key].language) elif key.endswith("_parsed"): # Date fields if entry[key] is not None: - self.set_as_date(key[:-len("_parsed")], entry[key]) + self.set_as_date(key[: -len("_parsed")], entry[key]) elif key == "source": # Source field: save both url and value if entry[key].has_key("value"): @@ -882,32 +867,32 @@ def update(self, entry): # Content field: concatenate the values value = "" for item in entry[key]: - if item.type == 'text/html': + if item.type == "text/html": item.value = sanitize.HTML(item.value) - elif item.type == 'text/plain': + elif item.type == "text/plain": item.value = escape(item.value) - if item.has_key('language') and item.language and \ - (not self._channel.has_key('language') or - item.language != self._channel.language) : + if ( + item.has_key("language") + and item.language + and (not self._channel.has_key("language") or item.language != self._channel.language) + ): self.set_as_string(key + "_language", item.language) value += cache.utf8(item.value) self.set_as_string(key, value) elif isinstance(entry[key], str): # String fields try: - detail = key + '_detail' - if entry.has_key(detail): - if entry[detail].has_key('type'): - if entry[detail].type == 'text/html': - entry[key] = sanitize.HTML(entry[key]) - elif entry[detail].type == 'text/plain': - entry[key] = escape(entry[key]) + detail = key + "_detail" + if entry.has_key(detail) and entry[detail].has_key("type"): + if entry[detail].type == "text/html": + entry[key] = sanitize.HTML(entry[key]) + elif entry[detail].type == "text/plain": + entry[key] = escape(entry[key]) self.set_as_string(key, entry[key]) except KeyboardInterrupt: raise except: - log.exception("Ignored '%s' of <%s>, unknown format", - key, self.id) + 
log.exception("Ignored '%s' of <%s>, unknown format", key, self.id) # Generate the date field if we need to self.get_date("date") @@ -925,7 +910,6 @@ def get_date(self, key): entries appear in posting sequence but don't overlap entries added in previous updates and don't creep into the next one. """ - for other_key in ("updated", "modified", "published", "issued", "created"): if self.has_key(other_key): date = self.get_as_date(other_key) @@ -936,8 +920,8 @@ def get_date(self, key): if date is not None: if date > self._channel.updated: date = self._channel.updated -# elif date < self._channel.last_updated: -# date = self._channel.updated + # elif date < self._channel.last_updated: + # date = self._channel.updated elif self.has_key(key) and self.key_type(key) != self.NULL: return self.get_as_date(key) else: diff --git a/code/planet/atomstyler.py b/code/planet/atomstyler.py index 645cc40..19d6c5c 100644 --- a/code/planet/atomstyler.py +++ b/code/planet/atomstyler.py @@ -1,124 +1,137 @@ -from xml.dom import minidom, Node +import re +from html.entities import name2codepoint from urllib.parse import urlparse, urlunparse +from xml.dom import Node, minidom from xml.parsers.expat import ExpatError -from html.entities import name2codepoint -import re + # select and apply an xml:base for this entry class relativize: - def __init__(self, parent): - self.score = {} - self.links = [] - self.collect_and_tally(parent) - self.base = self.select_optimal_base() - if self.base: - if not parent.hasAttribute('xml:base'): - self.rebase(parent) - parent.setAttribute('xml:base', self.base) - - # collect and tally cite, href and src attributes - def collect_and_tally(self,parent): - uri = None - if parent.hasAttribute('cite'): uri=parent.getAttribute('cite') - if parent.hasAttribute('href'): uri=parent.getAttribute('href') - if parent.hasAttribute('src'): uri=parent.getAttribute('src') - - if uri: - parts=urlparse(uri) - if parts[0].lower() == 'http': - parts = (parts[1]+parts[2]).split('/') - base = None - for i in range(1,len(parts)): - base = tuple(parts[0:i]) - self.score[base] = self.score.get(base,0) + len(base) - if base and base not in self.links: self.links.append(base) + def __init__(self, parent) -> None: + self.score = {} + self.links = [] + self.collect_and_tally(parent) + self.base = self.select_optimal_base() + if self.base and not parent.hasAttribute("xml:base"): + self.rebase(parent) + parent.setAttribute("xml:base", self.base) + + # collect and tally cite, href and src attributes + def collect_and_tally(self, parent) -> None: + uri = None + if parent.hasAttribute("cite"): + uri = parent.getAttribute("cite") + if parent.hasAttribute("href"): + uri = parent.getAttribute("href") + if parent.hasAttribute("src"): + uri = parent.getAttribute("src") + + if uri: + parts = urlparse(uri) + if parts[0].lower() == "http": + parts = (parts[1] + parts[2]).split("/") + base = None + for i in range(1, len(parts)): + base = tuple(parts[0:i]) + self.score[base] = self.score.get(base, 0) + len(base) + if base and base not in self.links: + self.links.append(base) + + for node in parent.childNodes: + if node.nodeType == Node.ELEMENT_NODE: + self.collect_and_tally(node) + + # select the xml:base with the highest score + def select_optimal_base(self): + if not self.score: + return None + for link in self.links: + self.score[link] = 0 + winner = max(self.score.values()) + if not winner: + return None + for key in self.score: + if self.score[key] == winner: + if winner == len(key): + return None + return 
urlunparse(("http", key[0], "/".join(key[1:]), "", "", "")) + "/" + return None + + # rewrite cite, href and src attributes using this base + def rebase(self, parent) -> None: + uri = None + if parent.hasAttribute("cite"): + uri = parent.getAttribute("cite") + if parent.hasAttribute("href"): + uri = parent.getAttribute("href") + if parent.hasAttribute("src"): + uri = parent.getAttribute("src") + if uri and uri.startswith(self.base): + uri = uri[len(self.base) :] or "." + if parent.hasAttribute("href"): + uri = parent.setAttribute("href", uri) + if parent.hasAttribute("src"): + uri = parent.setAttribute("src", uri) + + for node in parent.childNodes: + if node.nodeType == Node.ELEMENT_NODE: + self.rebase(node) - for node in parent.childNodes: - if node.nodeType == Node.ELEMENT_NODE: - self.collect_and_tally(node) - - # select the xml:base with the highest score - def select_optimal_base(self): - if not self.score: return None - for link in self.links: - self.score[link] = 0 - winner = max(self.score.values()) - if not winner: return None - for key in self.score.keys(): - if self.score[key] == winner: - if winner == len(key): return None - return urlunparse(('http', key[0], '/'.join(key[1:]), '', '', '')) + '/' - - # rewrite cite, href and src attributes using this base - def rebase(self,parent): - uri = None - if parent.hasAttribute('cite'): uri=parent.getAttribute('cite') - if parent.hasAttribute('href'): uri=parent.getAttribute('href') - if parent.hasAttribute('src'): uri=parent.getAttribute('src') - if uri and uri.startswith(self.base): - uri = uri[len(self.base):] or '.' - if parent.hasAttribute('href'): uri=parent.setAttribute('href', uri) - if parent.hasAttribute('src'): uri=parent.setAttribute('src', uri) - - for node in parent.childNodes: - if node.nodeType == Node.ELEMENT_NODE: - self.rebase(node) # convert type="html" to type="plain" or type="xhtml" as appropriate -def retype(parent): - for node in parent.childNodes: - if node.nodeType == Node.ELEMENT_NODE: - - if node.hasAttribute('type') and node.getAttribute('type') == 'html': - if len(node.childNodes)==0: - node.removeAttribute('type') - elif len(node.childNodes)==1: - - # replace html entity defs with utf-8 - chunks=re.split(r'&(\w+);', node.childNodes[0].nodeValue) - for i in range(1,len(chunks),2): - if chunks[i] in ['amp', 'lt', 'gt', 'apos', 'quot']: - chunks[i] ='&' + chunks[i] +';' - elif chunks[i] in name2codepoint: - chunks[i] = chr(name2codepoint[chunks[i]]) - else: - chunks[i]='&' + chunks[i] + ';' - text = "".join(chunks) - - try: - # see if the resulting text is a well-formed XML fragment - div = '
<div xmlns="http://www.w3.org/1999/xhtml">%s</div>
' - data = minidom.parseString(div % text.encode('utf-8')) - - if text.find('<') < 0: - # plain text - node.removeAttribute('type') - text = data.documentElement.childNodes[0].nodeValue - node.childNodes[0].replaceWholeText(text) - - elif len(text) > 80: - # xhtml - node.setAttribute('type', 'xhtml') - node.removeChild(node.childNodes[0]) - node.appendChild(data.documentElement) - - except ExpatError: - # leave as html - pass - - else: - # recurse - retype(node) - - if parent.nodeName == 'entry': - relativize(parent) - -if __name__ == '__main__': - - # run styler on each file mention on the command line - import sys - for feed in sys.argv[1:]: - doc = minidom.parse(feed) - doc.normalize() - retype(doc.documentElement) - open(feed,'w').write(doc.toxml('utf-8')) +def retype(parent) -> None: + for node in parent.childNodes: + if node.nodeType == Node.ELEMENT_NODE: + if node.hasAttribute("type") and node.getAttribute("type") == "html": + if len(node.childNodes) == 0: + node.removeAttribute("type") + elif len(node.childNodes) == 1: + # replace html entity defs with utf-8 + chunks = re.split(r"&(\w+);", node.childNodes[0].nodeValue) + for i in range(1, len(chunks), 2): + if chunks[i] in ["amp", "lt", "gt", "apos", "quot"]: + chunks[i] = "&" + chunks[i] + ";" + elif chunks[i] in name2codepoint: + chunks[i] = chr(name2codepoint[chunks[i]]) + else: + chunks[i] = "&" + chunks[i] + ";" + text = "".join(chunks) + + try: + # see if the resulting text is a well-formed XML fragment + div = '
<div xmlns="http://www.w3.org/1999/xhtml">%s</div>
' + data = minidom.parseString(div % text.encode("utf-8")) + + if text.find("<") < 0: + # plain text + node.removeAttribute("type") + text = data.documentElement.childNodes[0].nodeValue + node.childNodes[0].replaceWholeText(text) + + elif len(text) > 80: + # xhtml + node.setAttribute("type", "xhtml") + node.removeChild(node.childNodes[0]) + node.appendChild(data.documentElement) + + except ExpatError: + # leave as html + pass + + else: + # recurse + retype(node) + + if parent.nodeName == "entry": + relativize(parent) + + +if __name__ == "__main__": + # run styler on each file mention on the command line + import sys + + for feed in sys.argv[1:]: + doc = minidom.parse(feed) + doc.normalize() + retype(doc.documentElement) + open(feed, "w").write(doc.toxml("utf-8")) diff --git a/code/planet/cache.py b/code/planet/cache.py index 8472742..be2f50d 100644 --- a/code/planet/cache.py +++ b/code/planet/cache.py @@ -12,12 +12,11 @@ import os import re - # Regular expressions to sanitise cache filenames -re_url_scheme = re.compile(r'^[^:]*://') -re_slash = re.compile(r'[?/]+') -re_initial_cruft = re.compile(r'^[,.]*') -re_final_cruft = re.compile(r'[,.]*$') +re_url_scheme = re.compile(r"^[^:]*://") +re_slash = re.compile(r"[?/]+") +re_initial_cruft = re.compile(r"^[,.]*") +re_final_cruft = re.compile(r"[,.]*$") class CachedInfo: @@ -32,11 +31,12 @@ class CachedInfo: and implement get_FIELD and set_FIELD functions which will be automatically called. """ + STRING = "string" - DATE = "date" - NULL = "null" + DATE = "date" + NULL = "null" - def __init__(self, cache, id_, root=0): + def __init__(self, cache, id_, root=0) -> None: self._type = {} self._value = {} self._cached = {} @@ -53,12 +53,9 @@ def cache_key(self, key): else: return self._id + " " + key - def cache_read(self): + def cache_read(self) -> None: """Read information from the cache.""" - if self._root: - keys_key = " keys" - else: - keys_key = self._id + keys_key = " keys" if self._root else self._id if self._cache.has_key(keys_key): keys = self._cache[keys_key].split(" ") @@ -73,7 +70,7 @@ def cache_read(self): self._type[key] = self._cache[cache_key + " type"] self._cached[key] = 1 - def cache_write(self, sync=1): + def cache_write(self, sync=1) -> None: """Write information to the cache.""" self.cache_clear(sync=0) @@ -83,40 +80,34 @@ def cache_write(self, sync=1): if not self._cached[key]: if self._cache.has_key(cache_key): # Non-cached keys need to be cleared - del(self._cache[cache_key]) - del(self._cache[cache_key + " type"]) + del self._cache[cache_key] + del self._cache[cache_key + " type"] continue keys.append(key) self._cache[cache_key] = self._value[key] self._cache[cache_key + " type"] = self._type[key] - if self._root: - keys_key = " keys" - else: - keys_key = self._id + keys_key = " keys" if self._root else self._id self._cache[keys_key] = " ".join(keys) if sync: self._cache.sync() - def cache_clear(self, sync=1): + def cache_clear(self, sync=1) -> None: """Remove information from the cache.""" - if self._root: - keys_key = " keys" - else: - keys_key = self._id + keys_key = " keys" if self._root else self._id if self._cache.has_key(keys_key): keys = self._cache[keys_key].split(" ") - del(self._cache[keys_key]) + del self._cache[keys_key] else: return for key in keys: cache_key = self.cache_key(key) - del(self._cache[cache_key]) - del(self._cache[cache_key + " type"]) + del self._cache[cache_key] + del self._cache[cache_key + " type"] if sync: self._cache.sync() @@ -147,7 +138,7 @@ def set(self, key, value, cached=1): else: 
return func(key, value) - if value == None: + if value is None: return self.set_as_null(key, value) else: try: @@ -179,7 +170,7 @@ def get(self, key): return self._value[key] - def set_as_string(self, key, value, cached=1): + def set_as_string(self, key, value, cached=1) -> None: """Set the key to the string value. The value is converted to UTF-8 if it is a Unicode string, otherwise @@ -201,12 +192,12 @@ def get_as_string(self, key): return self._value[key] - def set_as_date(self, key, value, cached=1): + def set_as_date(self, key, value, cached=1) -> None: """Set the key to the date value. The date should be a 9-item tuple as returned by time.gmtime(). """ - value = " ".join([ str(s) for s in value ]) + value = " ".join([str(s) for s in value]) key = key.replace(" ", "_") self._value[key] = value @@ -220,9 +211,9 @@ def get_as_date(self, key): raise KeyError(key) value = self._value[key] - return tuple([ int(i) for i in value.split(" ") ]) + return tuple([int(i) for i in value.split(" ")]) - def set_as_null(self, key, value, cached=1): + def set_as_null(self, key, value, cached=1) -> None: """Set the key to the null value. This only exists to make things less magic. @@ -232,23 +223,21 @@ def set_as_null(self, key, value, cached=1): self._type[key] = self.NULL self._cached[key] = cached - def get_as_null(self, key): + def get_as_null(self, key) -> None: """Return the key as the null value.""" key = key.replace(" ", "_") if not self.has_key(key): raise KeyError(key) - return None - - def del_key(self, key): + def del_key(self, key) -> None: """Delete the given key.""" key = key.replace(" ", "_") if not self.has_key(key): raise KeyError(key) - del(self._value[key]) - del(self._type[key]) - del(self._cached[key]) + del self._value[key] + del self._type[key] + del self._cached[key] def keys(self): """Return the list of cached keys.""" @@ -260,12 +249,12 @@ def __iter__(self): # Special methods __contains__ = has_key - __setitem__ = set_as_string - __getitem__ = get - __delitem__ = del_key - __delattr__ = del_key + __setitem__ = set_as_string + __getitem__ = get + __delitem__ = del_key + __delattr__ = del_key - def __setattr__(self, key, value): + def __setattr__(self, key, value) -> None: if key.startswith("_"): self.__dict__[key] = value else: @@ -291,9 +280,10 @@ def filename(directory, filename): return os.path.join(directory, filename) + def utf8(value): """Return the value as a UTF-8 string.""" - if type(value) == type(''): + if type(value) == str: return value.encode("utf-8") else: try: diff --git a/code/planet/compat_logging/__init__.py b/code/planet/compat_logging/__init__.py index 6a751b8..a386793 100644 --- a/code/planet/compat_logging/__init__.py +++ b/code/planet/compat_logging/__init__.py @@ -14,8 +14,7 @@ # IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -""" -Logging package for Python. Based on PEP 282 and comments thereto in +"""Logging package for Python. Based on PEP 282 and comments thereto in comp.lang.python, and influenced by Apache's log4j system. Should work under Python versions >= 1.5.2, except that source line @@ -26,36 +25,35 @@ To use, simply 'import logging' and log away! 
""" -import sys +import io import os -import types -import time import string -import io +import sys +import time +import types +from typing import Never try: - import thread import threading + + import thread except ImportError: thread = None -__author__ = "Vinay Sajip " -__status__ = "beta" +__author__ = "Vinay Sajip " +__status__ = "beta" __version__ = "0.4.8.1" -__date__ = "26 June 2003" +__date__ = "26 June 2003" -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- # Miscellaneous module data -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- # -#_srcfile is used when walking the stack to check when we've got the first +# _srcfile is used when walking the stack to check when we've got the first # caller stack frame. # -if string.lower(__file__[-4:]) in ['.pyc', '.pyo']: - _srcfile = __file__[:-4] + '.py' -else: - _srcfile = __file__ +_srcfile = __file__[:-4] + ".py" if string.lower(__file__[-4:]) in [".pyc", ".pyo"] else __file__ _srcfile = os.path.normcase(_srcfile) # _srcfile is only used in conjunction with sys._getframe(). @@ -66,19 +64,19 @@ _srcfile = None # -#_startTime is used as the base when calculating the relative time of events +# _startTime is used as the base when calculating the relative time of events # _startTime = time.time() # -#raiseExceptions is used to see if exceptions during handling should be -#propagated +# raiseExceptions is used to see if exceptions during handling should be +# propagated # raiseExceptions = 1 -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- # Level related stuff -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- # # Default levels and level names, these can be replaced with any positive set # of values having corresponding names. There is a pseudo-level, NOTSET, which @@ -96,24 +94,24 @@ NOTSET = 0 _levelNames = { - CRITICAL : 'CRITICAL', - ERROR : 'ERROR', - WARNING : 'WARNING', - INFO : 'INFO', - DEBUG : 'DEBUG', - NOTSET : 'NOTSET', - 'CRITICAL' : CRITICAL, - 'ERROR' : ERROR, - 'WARN' : WARNING, - 'WARNING' : WARNING, - 'INFO' : INFO, - 'DEBUG' : DEBUG, - 'NOTSET' : NOTSET, + CRITICAL: "CRITICAL", + ERROR: "ERROR", + WARNING: "WARNING", + INFO: "INFO", + DEBUG: "DEBUG", + NOTSET: "NOTSET", + "CRITICAL": CRITICAL, + "ERROR": ERROR, + "WARN": WARNING, + "WARNING": WARNING, + "INFO": INFO, + "DEBUG": DEBUG, + "NOTSET": NOTSET, } + def getLevelName(level): - """ - Return the textual representation of logging level 'level'. + """Return the textual representation of logging level 'level'. If the level is one of the predefined levels (CRITICAL, ERROR, WARNING, INFO, DEBUG) then you get the corresponding string. If you have @@ -121,38 +119,39 @@ def getLevelName(level): associated with 'level' is returned. Otherwise, the string "Level %s" % level is returned. """ - return _levelNames.get(level, ("Level %s" % level)) + return _levelNames.get(level, (f"Level {level}")) -def addLevelName(level, levelName): - """ - Associate 'levelName' with 'level'. + +def addLevelName(level, levelName) -> None: + """Associate 'levelName' with 'level'. This is used when converting levels to text during message formatting. 
""" _acquireLock() - try: #unlikely to cause an exception, but you never know... + try: # unlikely to cause an exception, but you never know... _levelNames[level] = levelName _levelNames[levelName] = level finally: _releaseLock() -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Thread-related stuff -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- # -#_lock is used to serialize access to shared data structures in this module. -#This needs to be an RLock because fileConfig() creates Handlers and so -#might arbitrary user threads. Since Handler.__init__() updates the shared -#dictionary _handlers, it needs to acquire the lock. But if configuring, -#the lock would already have been acquired - so we need an RLock. -#The same argument applies to Loggers and Manager.loggerDict. +# _lock is used to serialize access to shared data structures in this module. +# This needs to be an RLock because fileConfig() creates Handlers and so +# might arbitrary user threads. Since Handler.__init__() updates the shared +# dictionary _handlers, it needs to acquire the lock. But if configuring, +# the lock would already have been acquired - so we need an RLock. +# The same argument applies to Loggers and Manager.loggerDict. # _lock = None -def _acquireLock(): - """ - Acquire the module-level lock for serializing access to shared data. + +def _acquireLock() -> None: + """Acquire the module-level lock for serializing access to shared data. This should be released with _releaseLock(). """ @@ -162,20 +161,20 @@ def _acquireLock(): if _lock: _lock.acquire() -def _releaseLock(): - """ - Release the module-level lock acquired by calling _acquireLock(). - """ + +def _releaseLock() -> None: + """Release the module-level lock acquired by calling _acquireLock().""" if _lock: _lock.release() -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # The logging record -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + class LogRecord: - """ - A LogRecord instance represents an event being logged. + """A LogRecord instance represents an event being logged. LogRecord instances are created every time something is logged. They contain all the information pertinent to the event being logged. The @@ -185,10 +184,9 @@ class LogRecord: the source line where the logging call was made, and any exception information to be logged. """ - def __init__(self, name, level, pathname, lineno, msg, args, exc_info): - """ - Initialize a logging record with interesting information. 
-        """
+
+    def __init__(self, name, level, pathname, lineno, msg, args, exc_info) -> None:
+        """Initialize a logging record with interesting information."""
         ct = time.time()
         self.name = name
         self.msg = msg
@@ -211,36 +209,34 @@ def __init__(self, name, level, pathname, lineno, msg, args, exc_info):
             self.thread = thread.get_ident()
         else:
             self.thread = None
-        if hasattr(os, 'getpid'):
+        if hasattr(os, "getpid"):
             self.process = os.getpid()
         else:
             self.process = None

-    def __str__(self):
-        return '<LogRecord: %s, %s, %s, %s, "%s">'%(self.name, self.levelno,
-            self.pathname, self.lineno, self.msg)
+    def __str__(self) -> str:
+        return f'<LogRecord: {self.name}, {self.levelno}, {self.pathname}, {self.lineno}, "{self.msg}">'

     def getMessage(self):
-        """
-        Return the message for this LogRecord.
+        """Return the message for this LogRecord.

         Return the message for this LogRecord after merging any user-supplied
         arguments with the message.
         """
-        if not hasattr(types, "UnicodeType"): #if no unicode support...
+        if not hasattr(types, "UnicodeType"):  # if no unicode support...
             msg = str(self.msg)
         else:
             try:
                 msg = str(self.msg)
             except UnicodeError:
-                msg = self.msg #Defer encoding till later
+                msg = self.msg  # Defer encoding till later
         if self.args:
             msg = msg % self.args
         return msg

+
 def makeLogRecord(dict):
-    """
-    Make a LogRecord whose attributes are defined by the specified dictionary,
+    """Make a LogRecord whose attributes are defined by the specified dictionary.

     This function is useful for converting a logging event received over
     a socket connection (which is sent as a dictionary) into a LogRecord
     instance.
@@ -249,13 +245,14 @@ def makeLogRecord(dict):
     rv.__dict__.update(dict)
     return rv

-#---------------------------------------------------------------------------
+
+# ---------------------------------------------------------------------------
 # Formatter classes and functions
-#---------------------------------------------------------------------------
+# ---------------------------------------------------------------------------
+

 class Formatter:
-    """
-    Formatter instances are used to convert a LogRecord to text.
+    r"""Formatter instances are used to convert a LogRecord to text.

     Formatters need to know how a LogRecord is constructed. They are
     responsible for converting a LogRecord to (usually) a string which can
@@ -295,9 +292,8 @@ class Formatter:

     converter = time.localtime

-    def __init__(self, fmt=None, datefmt=None):
-        """
-        Initialize the formatter with specified format strings.
+    def __init__(self, fmt=None, datefmt=None) -> None:
+        """Initialize the formatter with specified format strings.

         Initialize the formatter either with the specified format string, or a
         default as described above. Allow for specialized date formatting with
@@ -310,8 +306,7 @@ def __init__(self, fmt=None, datefmt=None):
         self.datefmt = datefmt

     def formatTime(self, record, datefmt=None):
-        """
-        Return the creation time of the specified LogRecord as formatted text.
+        """Return the creation time of the specified LogRecord as formatted text.

         This method should be called from format() by a formatter which
         wants to make use of a formatted time. This method can be overridden
@@ -336,13 +331,13 @@ def formatTime(self, record, datefmt=None):
         return s

     def formatException(self, ei):
-        """
-        Format and return the specified exception information as a string.
This default implementation just uses traceback.print_exception() """ import traceback + sio = io.StringIO() traceback.print_exception(ei[0], ei[1], ei[2], None, sio) s = sio.getvalue() @@ -352,8 +347,7 @@ def formatException(self, ei): return s def format(self, record): - """ - Format the specified record as text. + """Format the specified record as text. The record's attribute dictionary is used as the operand to a string formatting operation which yields the returned string. @@ -365,7 +359,7 @@ def format(self, record): formatException() and appended to the message. """ record.message = record.getMessage() - if string.find(self._fmt,"%(asctime)") >= 0: + if string.find(self._fmt, "%(asctime)") >= 0: record.asctime = self.formatTime(record, self.datefmt) s = self._fmt % record.__dict__ if record.exc_info: @@ -374,18 +368,18 @@ def format(self, record): s = s + self.formatException(record.exc_info) return s + # # The default formatter to use when no other is specified # _defaultFormatter = Formatter() + class BufferingFormatter: - """ - A formatter suitable for formatting a number of records. - """ - def __init__(self, linefmt=None): - """ - Optionally specify a formatter which will be used to format each + """A formatter suitable for formatting a number of records.""" + + def __init__(self, linefmt=None) -> None: + """Optionally specify a formatter which will be used to format each individual record. """ if linefmt: @@ -393,22 +387,16 @@ def __init__(self, linefmt=None): else: self.linefmt = _defaultFormatter - def formatHeader(self, records): - """ - Return the header string for the specified records. - """ + def formatHeader(self, records) -> str: + """Return the header string for the specified records.""" return "" - def formatFooter(self, records): - """ - Return the footer string for the specified records. - """ + def formatFooter(self, records) -> str: + """Return the footer string for the specified records.""" return "" def format(self, records): - """ - Format the specified records and return the result as a string. - """ + """Format the specified records and return the result as a string.""" rv = "" if len(records) > 0: rv = rv + self.formatHeader(records) @@ -417,13 +405,14 @@ def format(self, records): rv = rv + self.formatFooter(records) return rv -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Filter classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + class Filter: - """ - Filter instances are used to perform arbitrary filtering of LogRecords. + """Filter instances are used to perform arbitrary filtering of LogRecords. Loggers and Handlers can optionally use Filter instances to filter records as desired. The base filter class only allows events which are @@ -432,9 +421,9 @@ class Filter: "A.B.C", "A.B.C.D", "A.B.D" etc. but not "A.BB", "B.A.B" etc. If initialized with the empty string, all events are passed. """ - def __init__(self, name=''): - """ - Initialize a filter. + + def __init__(self, name="") -> None: + """Initialize a filter. Initialize with the name of the logger which, together with its children, will have its events allowed through the filter. If no @@ -444,48 +433,39 @@ def __init__(self, name=''): self.nlen = len(name) def filter(self, record): - """ - Determine if the specified record is to be logged. 
+ """Determine if the specified record is to be logged. Is the specified record to be logged? Returns 0 for no, nonzero for yes. If deemed appropriate, the record may be modified in-place. """ - if self.nlen == 0: - return 1 - elif self.name == record.name: + if self.nlen == 0 or self.name == record.name: return 1 elif string.find(record.name, self.name, 0, self.nlen) != 0: return 0 - return (record.name[self.nlen] == ".") + return record.name[self.nlen] == "." + class Filterer: - """ - A base class for loggers and handlers which allows them to share + """A base class for loggers and handlers which allows them to share common code. """ - def __init__(self): - """ - Initialize the list of filters to be an empty list. - """ + + def __init__(self) -> None: + """Initialize the list of filters to be an empty list.""" self.filters = [] - def addFilter(self, filter): - """ - Add the specified filter to this handler. - """ - if not (filter in self.filters): + def addFilter(self, filter) -> None: + """Add the specified filter to this handler.""" + if filter not in self.filters: self.filters.append(filter) - def removeFilter(self, filter): - """ - Remove the specified filter from this handler. - """ + def removeFilter(self, filter) -> None: + """Remove the specified filter from this handler.""" if filter in self.filters: self.filters.remove(filter) def filter(self, record): - """ - Determine if a record is loggable by consulting all the filters. + """Determine if a record is loggable by consulting all the filters. The default is to allow the record to be logged; any filter can veto this and the record is then dropped. Returns a zero value if a record @@ -498,91 +478,79 @@ def filter(self, record): break return rv -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Handler classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + +_handlers = {} # repository of handlers (for flushing when shutdown called) -_handlers = {} #repository of handlers (for flushing when shutdown called) class Handler(Filterer): - """ - Handler instances dispatch logging events to specific destinations. + """Handler instances dispatch logging events to specific destinations. The base handler class. Acts as a placeholder which defines the Handler interface. Handlers can optionally use Formatter instances to format records as desired. By default, no formatter is specified; in this case, the 'raw' message as determined by record.message is logged. """ - def __init__(self, level=NOTSET): - """ - Initializes the instance - basically setting the formatter to None + + def __init__(self, level=NOTSET) -> None: + """Initializes the instance - basically setting the formatter to None and the filter list to empty. """ Filterer.__init__(self) self.level = level self.formatter = None - #get the module data lock, as we're updating a shared structure. + # get the module data lock, as we're updating a shared structure. _acquireLock() - try: #unlikely to raise an exception, but you never know... + try: # unlikely to raise an exception, but you never know... _handlers[self] = 1 finally: _releaseLock() self.createLock() - def createLock(self): - """ - Acquire a thread lock for serializing access to the underlying I/O. 
- """ + def createLock(self) -> None: + """Acquire a thread lock for serializing access to the underlying I/O.""" if thread: self.lock = thread.allocate_lock() else: self.lock = None - def acquire(self): - """ - Acquire the I/O thread lock. - """ + def acquire(self) -> None: + """Acquire the I/O thread lock.""" if self.lock: self.lock.acquire() - def release(self): - """ - Release the I/O thread lock. - """ + def release(self) -> None: + """Release the I/O thread lock.""" if self.lock: self.lock.release() - def setLevel(self, level): - """ - Set the logging level of this handler. - """ + def setLevel(self, level) -> None: + """Set the logging level of this handler.""" self.level = level def format(self, record): - """ - Format the specified record. + """Format the specified record. If a formatter is set, use it. Otherwise, use the default formatter for the module. """ - if self.formatter: - fmt = self.formatter - else: - fmt = _defaultFormatter + fmt = self.formatter if self.formatter else _defaultFormatter return fmt.format(record) - def emit(self, record): - """ - Do whatever it takes to actually log the specified logging record. + def emit(self, record) -> Never: + """Do whatever it takes to actually log the specified logging record. This version is intended to be implemented by subclasses and so raises a NotImplementedError. """ - raise NotImplementedError('emit must be implemented by Handler subclasses') + msg = "emit must be implemented by Handler subclasses" + raise NotImplementedError(msg) def handle(self, record): - """ - Conditionally emit the specified logging record. + """Conditionally emit the specified logging record. Emission depends on filters which may have been added to the handler. Wrap the actual emission of the record with acquisition/release of @@ -598,33 +566,26 @@ def handle(self, record): self.release() return rv - def setFormatter(self, fmt): - """ - Set the formatter for this handler. - """ + def setFormatter(self, fmt) -> None: + """Set the formatter for this handler.""" self.formatter = fmt - def flush(self): - """ - Ensure all logging output has been flushed. + def flush(self) -> None: + """Ensure all logging output has been flushed. This version does nothing and is intended to be implemented by subclasses. """ - pass - def close(self): - """ - Tidy up any resources used by the handler. + def close(self) -> None: + """Tidy up any resources used by the handler. This version does nothing and is intended to be implemented by subclasses. """ - pass - def handleError(self, record): - """ - Handle errors which occur during an emit() call. + def handleError(self, record) -> None: + """Handle errors which occur during an emit() call. This method should be called from handlers when an exception is encountered during an emit() call. If raiseExceptions is false, @@ -636,19 +597,20 @@ def handleError(self, record): """ if raiseExceptions: import traceback + ei = sys.exc_info() traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr) del ei + class StreamHandler(Handler): - """ - A handler class which writes logging records, appropriately formatted, + """A handler class which writes logging records, appropriately formatted, to a stream. Note that this class does not close the stream, as sys.stdout or sys.stderr may be used. """ - def __init__(self, strm=None): - """ - Initialize the handler. + + def __init__(self, strm=None) -> None: + """Initialize the handler. If strm is not specified, sys.stderr is used. 
""" @@ -658,15 +620,12 @@ def __init__(self, strm=None): self.stream = strm self.formatter = None - def flush(self): - """ - Flushes the stream. - """ + def flush(self) -> None: + """Flushes the stream.""" self.stream.flush() - def emit(self, record): - """ - Emit a record. + def emit(self, record) -> None: + """Emit a record. If a formatter is specified, it is used to format the record. The record is then written to the stream with a trailing newline @@ -676,92 +635,85 @@ def emit(self, record): """ try: msg = self.format(record) - if not hasattr(types, "UnicodeType"): #if no unicode support... - self.stream.write("%s\n" % msg) + if not hasattr(types, "UnicodeType"): # if no unicode support... + self.stream.write(f"{msg}\n") else: try: - self.stream.write("%s\n" % msg) + self.stream.write(f"{msg}\n") except UnicodeError: - self.stream.write("%s\n" % msg.encode("UTF-8")) + self.stream.write("{}\n".format(msg.encode("UTF-8"))) self.flush() except: self.handleError(record) + class FileHandler(StreamHandler): - """ - A handler class which writes formatted logging records to disk files. - """ - def __init__(self, filename, mode="a"): - """ - Open the specified file and use it as the stream for logging. - """ + """A handler class which writes formatted logging records to disk files.""" + + def __init__(self, filename, mode="a") -> None: + """Open the specified file and use it as the stream for logging.""" StreamHandler.__init__(self, open(filename, mode)) self.baseFilename = filename self.mode = mode - def close(self): - """ - Closes the stream. - """ + def close(self) -> None: + """Closes the stream.""" self.stream.close() -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Manager classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + class PlaceHolder: - """ - PlaceHolder instances are used in the Manager logger hierarchy to take + """PlaceHolder instances are used in the Manager logger hierarchy to take the place of nodes for which no loggers have been defined [FIXME add example]. """ - def __init__(self, alogger): - """ - Initialize with the specified logger being a child of this placeholder. - """ + + def __init__(self, alogger) -> None: + """Initialize with the specified logger being a child of this placeholder.""" self.loggers = [alogger] - def append(self, alogger): - """ - Add the specified logger as a child of this placeholder. - """ + def append(self, alogger) -> None: + """Add the specified logger as a child of this placeholder.""" if alogger not in self.loggers: self.loggers.append(alogger) + # # Determine which class to use when instantiating loggers. # _loggerClass = None -def setLoggerClass(klass): - """ - Set the class to be used when instantiating a logger. The class should + +def setLoggerClass(klass) -> None: + """Set the class to be used when instantiating a logger. The class should define __init__() such that only a name argument is required, and the - __init__() should call Logger.__init__() + __init__() should call Logger.__init__(). 
""" - if klass != Logger: - if not issubclass(klass, Logger): - raise TypeError(f"logger not derived from logging.Logger: {klass.__name__}") + if klass != Logger and not issubclass(klass, Logger): + msg = f"logger not derived from logging.Logger: {klass.__name__}" + raise TypeError(msg) global _loggerClass _loggerClass = klass + class Manager: - """ - There is [under normal circumstances] just one Manager instance, which + """There is [under normal circumstances] just one Manager instance, which holds the hierarchy of loggers. """ - def __init__(self, rootnode): - """ - Initialize the manager with the root node of the logger hierarchy. - """ + + def __init__(self, rootnode) -> None: + """Initialize the manager with the root node of the logger hierarchy.""" self.root = rootnode self.disable = 0 self.emittedNoHandlerWarning = 0 self.loggerDict = {} def getLogger(self, name): - """ - Get a logger with the specified name (channel name), creating it + """Get a logger with the specified name (channel name), creating it if it doesn't yet exist. If a PlaceHolder existed for the specified name [i.e. the logger @@ -790,9 +742,8 @@ def getLogger(self, name): _releaseLock() return rv - def _fixupParents(self, alogger): - """ - Ensure that there are either loggers or placeholders all the way + def _fixupParents(self, alogger) -> None: + """Ensure that there are either loggers or placeholders all the way from the specified logger to the root of the logger hierarchy. """ name = alogger.name @@ -814,9 +765,8 @@ def _fixupParents(self, alogger): rv = self.root alogger.parent = rv - def _fixupChildren(self, ph, alogger): - """ - Ensure that children of the placeholder ph are connected to the + def _fixupChildren(self, ph, alogger) -> None: + """Ensure that children of the placeholder ph are connected to the specified logger. """ for c in ph.loggers: @@ -824,13 +774,14 @@ def _fixupChildren(self, ph, alogger): alogger.parent = c.parent c.parent = alogger -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Logger classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + class Logger(Filterer): - """ - Instances of the Logger class represent a single logging channel. A + """Instances of the Logger class represent a single logging channel. A "logging channel" indicates an area of an application. Exactly how an "area" is defined is up to the application developer. Since an application can have any number of areas, logging channels are identified @@ -843,10 +794,9 @@ class Logger(Filterer): level, and "input.csv", "input.xls" and "input.gnu" for the sub-levels. There is no arbitrary limit to the depth of nesting. """ - def __init__(self, name, level=NOTSET): - """ - Initialize the logger with a name and an optional level. - """ + + def __init__(self, name, level=NOTSET) -> None: + """Initialize the logger with a name and an optional level.""" Filterer.__init__(self) self.name = name self.level = level @@ -855,21 +805,18 @@ def __init__(self, name, level=NOTSET): self.handlers = [] self.disabled = 0 - def setLevel(self, level): - """ - Set the logging level of this logger. - """ + def setLevel(self, level) -> None: + """Set the logging level of this logger.""" self.level = level -# def getRoot(self): -# """ -# Get the root of the logger hierarchy. 
-# """ -# return Logger.root + # def getRoot(self): + # """ + # Get the root of the logger hierarchy. + # """ + # return Logger.root - def debug(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'DEBUG'. + def debug(self, msg, *args, **kwargs) -> None: + """Log 'msg % args' with severity 'DEBUG'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -878,12 +825,11 @@ def debug(self, msg, *args, **kwargs): """ if self.manager.disable >= DEBUG: return - if DEBUG >= self.getEffectiveLevel(): + if self.getEffectiveLevel() <= DEBUG: self._log(DEBUG, msg, *args, **kwargs) - def info(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'INFO'. + def info(self, msg, *args, **kwargs) -> None: + """Log 'msg % args' with severity 'INFO'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -892,12 +838,11 @@ def info(self, msg, *args, **kwargs): """ if self.manager.disable >= INFO: return - if INFO >= self.getEffectiveLevel(): + if self.getEffectiveLevel() <= INFO: self._log(INFO, msg, args, **kwargs) - def warning(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'WARNING'. + def warning(self, msg, *args, **kwargs) -> None: + """Log 'msg % args' with severity 'WARNING'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -911,9 +856,8 @@ def warning(self, msg, *args, **kwargs): warn = warning - def error(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'ERROR'. + def error(self, msg, *args, **kwargs) -> None: + """Log 'msg % args' with severity 'ERROR'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -925,15 +869,12 @@ def error(self, msg, *args, **kwargs): if self.isEnabledFor(ERROR): self._log(ERROR, msg, args, **kwargs) - def exception(self, msg, *args): - """ - Convenience method for logging an ERROR with exception information. - """ + def exception(self, msg, *args) -> None: + """Convenience method for logging an ERROR with exception information.""" self.error(msg, *args, exc_info=True) - def critical(self, msg, *args, **kwargs): - """ - Log 'msg % args' with severity 'CRITICAL'. + def critical(self, msg, *args, **kwargs) -> None: + """Log 'msg % args' with severity 'CRITICAL'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -942,14 +883,13 @@ def critical(self, msg, *args, **kwargs): """ if self.manager.disable >= CRITICAL: return - if CRITICAL >= self.getEffectiveLevel(): + if self.getEffectiveLevel() <= CRITICAL: self._log(CRITICAL, msg, *args, **kwargs) fatal = critical - def log(self, level, msg, *args, **kwargs): - """ - Log 'msg % args' with the severity 'level'. + def log(self, level, msg, *args, **kwargs) -> None: + """Log 'msg % args' with the severity 'level'. To pass exception information, use the keyword argument exc_info with a true value, e.g. @@ -962,8 +902,7 @@ def log(self, level, msg, *args, **kwargs): self._log(level, msg, args, **kwargs) def findCaller(self): - """ - Find the stack frame of the caller so that we can note the source + """Find the stack frame of the caller so that we can note the source file name and line number. 
""" f = sys._getframe(1) @@ -974,17 +913,16 @@ def findCaller(self): f = f.f_back continue return filename, f.f_lineno + return None def makeRecord(self, name, level, fn, lno, msg, args, exc_info): - """ - A factory method which can be overridden in subclasses to create + """A factory method which can be overridden in subclasses to create specialized LogRecords. """ return LogRecord(name, level, fn, lno, msg, args, exc_info) - def _log(self, level, msg, args, exc_info=None): - """ - Low-level logging routine which creates a LogRecord and then calls + def _log(self, level, msg, args, exc_info=None) -> None: + """Low-level logging routine which creates a LogRecord and then calls all the handlers of this logger to handle the record. """ if _srcfile: @@ -996,9 +934,8 @@ def _log(self, level, msg, args, exc_info=None): record = self.makeRecord(self.name, level, fn, lno, msg, args, exc_info) self.handle(record) - def handle(self, record): - """ - Call the handlers for the specified record. + def handle(self, record) -> None: + """Call the handlers for the specified record. This method is used for unpickled records received from a socket, as well as those created locally. Logger-level filtering is applied. @@ -1006,24 +943,19 @@ def handle(self, record): if (not self.disabled) and self.filter(record): self.callHandlers(record) - def addHandler(self, hdlr): - """ - Add the specified handler to this logger. - """ - if not (hdlr in self.handlers): + def addHandler(self, hdlr) -> None: + """Add the specified handler to this logger.""" + if hdlr not in self.handlers: self.handlers.append(hdlr) - def removeHandler(self, hdlr): - """ - Remove the specified handler from this logger. - """ + def removeHandler(self, hdlr) -> None: + """Remove the specified handler from this logger.""" if hdlr in self.handlers: - #hdlr.close() + # hdlr.close() self.handlers.remove(hdlr) - def callHandlers(self, record): - """ - Pass a record to all relevant handlers. + def callHandlers(self, record) -> None: + """Pass a record to all relevant handlers. Loop through all handlers for this logger and its parents in the logger hierarchy. If no handler was found, output a one-off error @@ -1039,17 +971,15 @@ def callHandlers(self, record): if record.levelno >= hdlr.level: hdlr.handle(record) if not c.propagate: - c = None #break out + c = None # break out else: c = c.parent if (found == 0) and not self.manager.emittedNoHandlerWarning: - sys.stderr.write("No handlers could be found for logger" - " \"%s\"\n" % self.name) + sys.stderr.write("No handlers could be found for logger" f' "{self.name}"\n') self.manager.emittedNoHandlerWarning = 1 def getEffectiveLevel(self): - """ - Get the effective level for this logger. + """Get the effective level for this logger. Loop through this logger and its parents in the logger hierarchy, looking for a non-zero logging level. Return the first one found. @@ -1062,40 +992,38 @@ def getEffectiveLevel(self): return NOTSET def isEnabledFor(self, level): - """ - Is this logger enabled for level 'level'? - """ + """Is this logger enabled for level 'level'?.""" if self.manager.disable >= level: return 0 return level >= self.getEffectiveLevel() + class RootLogger(Logger): - """ - A root logger is not that different to any other logger, except that + """A root logger is not that different to any other logger, except that it must have a logging level and there is only one instance of it in the hierarchy. """ - def __init__(self, level): - """ - Initialize the logger with the name "root". 
- """ + + def __init__(self, level) -> None: + """Initialize the logger with the name "root".""" Logger.__init__(self, "root", level) + _loggerClass = Logger root = RootLogger(WARNING) Logger.root = root Logger.manager = Manager(Logger.root) -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- # Configuration classes and functions -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- BASIC_FORMAT = "%(levelname)s:%(name)s:%(message)s" -def basicConfig(): - """ - Do basic configuration for the logging system by creating a + +def basicConfig() -> None: + """Do basic configuration for the logging system by creating a StreamHandler with a default Formatter and adding it to the root logger. """ @@ -1105,14 +1033,15 @@ def basicConfig(): hdlr.setFormatter(fmt) root.addHandler(hdlr) -#--------------------------------------------------------------------------- + +# --------------------------------------------------------------------------- # Utility functions at module level. # Basically delegate everything to the root logger. -#--------------------------------------------------------------------------- +# --------------------------------------------------------------------------- + def getLogger(name=None): - """ - Return a logger with the specified name, creating it if necessary. + """Return a logger with the specified name, creating it if necessary. If no name is specified, return the root logger. """ @@ -1121,7 +1050,8 @@ def getLogger(name=None): else: return root -#def getRootLogger(): + +# def getRootLogger(): # """ # Return the root logger. # @@ -1130,70 +1060,66 @@ def getLogger(name=None): # """ # return root -def critical(msg, *args, **kwargs): - """ - Log a message with severity 'CRITICAL' on the root logger. - """ + +def critical(msg, *args, **kwargs) -> None: + """Log a message with severity 'CRITICAL' on the root logger.""" if len(root.handlers) == 0: basicConfig() root.critical(msg, *args, **kwargs) + fatal = critical -def error(msg, *args, **kwargs): - """ - Log a message with severity 'ERROR' on the root logger. - """ + +def error(msg, *args, **kwargs) -> None: + """Log a message with severity 'ERROR' on the root logger.""" if len(root.handlers) == 0: basicConfig() root.error(msg, *args, **kwargs) -def exception(msg, *args): - """ - Log a message with severity 'ERROR' on the root logger, + +def exception(msg, *args) -> None: + """Log a message with severity 'ERROR' on the root logger, with exception information. """ error(msg, *args, exc_info=True) -def warning(msg, *args, **kwargs): - """ - Log a message with severity 'WARNING' on the root logger. - """ + +def warning(msg, *args, **kwargs) -> None: + """Log a message with severity 'WARNING' on the root logger.""" if len(root.handlers) == 0: basicConfig() root.warning(msg, *args, **kwargs) + warn = warning -def info(msg, *args, **kwargs): - """ - Log a message with severity 'INFO' on the root logger. - """ + +def info(msg, *args, **kwargs) -> None: + """Log a message with severity 'INFO' on the root logger.""" if len(root.handlers) == 0: basicConfig() root.info(msg, *args, **kwargs) -def debug(msg, *args, **kwargs): - """ - Log a message with severity 'DEBUG' on the root logger. 
- """ + +def debug(msg, *args, **kwargs) -> None: + """Log a message with severity 'DEBUG' on the root logger.""" if len(root.handlers) == 0: basicConfig() root.debug(msg, *args, **kwargs) -def disable(level): - """ - Disable all logging calls less severe than 'level'. - """ + +def disable(level) -> None: + """Disable all logging calls less severe than 'level'.""" root.manager.disable = level -def shutdown(): - """ - Perform any cleanup actions in the logging system (e.g. flushing + +def shutdown() -> None: + """Perform any cleanup actions in the logging system (e.g. flushing buffers). Should be called at application exit. """ - for h in _handlers.keys(): + for h in _handlers: h.flush() h.close() diff --git a/code/planet/compat_logging/config.py b/code/planet/compat_logging/config.py index 6057906..5382183 100644 --- a/code/planet/compat_logging/config.py +++ b/code/planet/compat_logging/config.py @@ -14,8 +14,7 @@ # IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -""" -Logging package for Python. Based on PEP 282 and comments thereto in +"""Logging package for Python. Based on PEP 282 and comments thereto in comp.lang.python, and influenced by Apache's log4j system. Should work under Python versions >= 1.5.2, except that source line @@ -29,18 +28,17 @@ import logging import logging.handlers import os -import socket import string import struct import sys -from socketserver import ThreadingTCPServer, StreamRequestHandler import threading +from socketserver import StreamRequestHandler, ThreadingTCPServer DEFAULT_LOGGING_CONFIG_PORT = 9030 if sys.platform == "win32": - RESET_ERROR = 10054 #WSAECONNRESET + RESET_ERROR = 10054 # WSAECONNRESET else: - RESET_ERROR = 104 #ECONNRESET + RESET_ERROR = 104 # ECONNRESET # # The following code implements a socket listener for on-the-fly @@ -49,9 +47,9 @@ # _listener holds the server object doing the listening _listener = None -def fileConfig(fname, defaults=None): - """ - Read the logging configuration from a ConfigParser-format file. + +def fileConfig(fname, defaults=None) -> None: + """Read the logging configuration from a ConfigParser-format file. This can be called several times from an application, allowing an end user the ability to select from various pre-canned configurations (if the @@ -65,49 +63,40 @@ def fileConfig(fname, defaults=None): import configparser cp = configparser.ConfigParser(defaults) - if hasattr(cp, 'readfp') and hasattr(fname, 'readline'): + if hasattr(cp, "readfp") and hasattr(fname, "readline"): cp.readfp(fname) else: cp.read(fname) - #first, do the formatters... + # first, do the formatters... flist = cp.get("formatters", "keys") if len(flist): flist = string.split(flist, ",") formatters = {} for form in flist: - sectname = "formatter_%s" % form + sectname = f"formatter_{form}" opts = cp.options(sectname) - if "format" in opts: - fs = cp.get(sectname, "format", 1) - else: - fs = None - if "datefmt" in opts: - dfs = cp.get(sectname, "datefmt", 1) - else: - dfs = None + fs = cp.get(sectname, "format", 1) if "format" in opts else None + dfs = cp.get(sectname, "datefmt", 1) if "datefmt" in opts else None f = logging.Formatter(fs, dfs) formatters[form] = f - #next, do the handlers... - #critical section... + # next, do the handlers... + # critical section... logging._acquireLock() try: try: - #first, lose the existing handlers... + # first, lose the existing handlers... logging._handlers.clear() - #now set up the new ones... 
+ # now set up the new ones... hlist = cp.get("handlers", "keys") if len(hlist): hlist = string.split(hlist, ",") handlers = {} - fixups = [] #for inter-handler references + fixups = [] # for inter-handler references for hand in hlist: - sectname = "handler_%s" % hand + sectname = f"handler_{hand}" klass = cp.get(sectname, "class") opts = cp.options(sectname) - if "formatter" in opts: - fmt = cp.get(sectname, "formatter") - else: - fmt = "" + fmt = cp.get(sectname, "formatter") if "formatter" in opts else "" klass = eval(klass, vars(logging)) args = cp.get(sectname, "args") args = eval(args, vars(logging)) @@ -117,21 +106,18 @@ def fileConfig(fname, defaults=None): h.setLevel(logging._levelNames[level]) if len(fmt): h.setFormatter(formatters[fmt]) - #temporary hack for FileHandler and MemoryHandler. + # temporary hack for FileHandler and MemoryHandler. if klass == logging.handlers.MemoryHandler: - if "target" in opts: - target = cp.get(sectname,"target") - else: - target = "" - if len(target): #the target handler may not be loaded yet, so keep for later... + target = cp.get(sectname, "target") if "target" in opts else "" + if len(target): # the target handler may not be loaded yet, so keep for later... fixups.append((h, target)) handlers[hand] = h - #now all handlers are loaded, fixup inter-handler references... + # now all handlers are loaded, fixup inter-handler references... for fixup in fixups: h = fixup[0] t = fixup[1] h.setTarget(handlers[t]) - #at last, the loggers...first the root... + # at last, the loggers...first the root... llist = cp.get("loggers", "keys") llist = string.split(llist, ",") llist.remove("root") @@ -149,25 +135,22 @@ def fileConfig(fname, defaults=None): hlist = string.split(hlist, ",") for hand in hlist: log.addHandler(handlers[hand]) - #and now the others... - #we don't want to lose the existing loggers, - #since other threads may have pointers to them. - #existing is set to contain all existing loggers, - #and as we go through the new configuration we - #remove any which are configured. At the end, - #what's left in existing is the set of loggers - #which were in the previous configuration but - #which are not in the new configuration. + # and now the others... + # we don't want to lose the existing loggers, + # since other threads may have pointers to them. + # existing is set to contain all existing loggers, + # and as we go through the new configuration we + # remove any which are configured. At the end, + # what's left in existing is the set of loggers + # which were in the previous configuration but + # which are not in the new configuration. existing = root.manager.loggerDict.keys() - #now set up the new ones... + # now set up the new ones... for log in llist: - sectname = "logger_%s" % log + sectname = f"logger_{log}" qn = cp.get(sectname, "qualname") opts = cp.options(sectname) - if "propagate" in opts: - propagate = cp.getint(sectname, "propagate") - else: - propagate = 1 + propagate = cp.getint(sectname, "propagate") if "propagate" in opts else 1 logger = logging.getLogger(qn) if qn in existing: existing.remove(qn) @@ -183,22 +166,23 @@ def fileConfig(fname, defaults=None): hlist = string.split(hlist, ",") for hand in hlist: logger.addHandler(handlers[hand]) - #Disable any old loggers. There's no point deleting - #them as other threads may continue to hold references - #and by disabling them, you stop them doing any logging. + # Disable any old loggers. 
There's no point deleting + # them as other threads may continue to hold references + # and by disabling them, you stop them doing any logging. for log in existing: root.manager.loggerDict[log].disabled = 1 except: import traceback + ei = sys.exc_info() traceback.print_exception(ei[0], ei[1], ei[2], None, sys.stderr) del ei finally: logging._releaseLock() + def listen(port=DEFAULT_LOGGING_CONFIG_PORT): - """ - Start up a socket server on the specified port, and listen for new + """Start up a socket server on the specified port, and listen for new configurations. These will be sent as a file suitable for processing by fileConfig(). @@ -207,24 +191,25 @@ def listen(port=DEFAULT_LOGGING_CONFIG_PORT): stopListening(). """ if not threading: - raise NotImplementedError("listen() needs threading to work") + msg = "listen() needs threading to work" + raise NotImplementedError(msg) class ConfigStreamHandler(StreamRequestHandler): - """ - Handler for a logging configuration request. + """Handler for a logging configuration request. It expects a completely new logging configuration and uses fileConfig to install it. """ - def handle(self): - """ - Handle a request. + + def handle(self) -> None: + """Handle a request. Each request is expected to be a 4-byte length, followed by the config file. Uses fileConfig() to do the grunt work. """ import tempfile + try: conn = self.connection chunk = conn.recv(4) @@ -233,18 +218,18 @@ def handle(self): chunk = self.connection.recv(slen) while len(chunk) < slen: chunk = chunk + conn.recv(slen - len(chunk)) - #Apply new configuration. We'd like to be able to - #create a StringIO and pass that in, but unfortunately - #1.5.2 ConfigParser does not support reading file - #objects, only actual files. So we create a temporary - #file and remove it later. + # Apply new configuration. We'd like to be able to + # create a StringIO and pass that in, but unfortunately + # 1.5.2 ConfigParser does not support reading file + # objects, only actual files. So we create a temporary + # file and remove it later. file = tempfile.mktemp(".ini") f = open(file, "w") f.write(chunk) f.close() fileConfig(file) os.remove(file) - except socket.error as e: + except OSError as e: if type(e.args) != tuple: raise else: @@ -253,34 +238,30 @@ def handle(self): raise class ConfigSocketReceiver(ThreadingTCPServer): - """ - A simple TCP socket-based logging config receiver. 
- """ + """A simple TCP socket-based logging config receiver.""" allow_reuse_address = 1 - def __init__(self, host='localhost', port=DEFAULT_LOGGING_CONFIG_PORT, - handler=None): + def __init__(self, host="localhost", port=DEFAULT_LOGGING_CONFIG_PORT, handler=None) -> None: ThreadingTCPServer.__init__(self, (host, port), handler) logging._acquireLock() self.abort = 0 logging._releaseLock() self.timeout = 1 - def serve_until_stopped(self): + def serve_until_stopped(self) -> None: import select + abort = 0 while not abort: - rd, wr, ex = select.select([self.socket.fileno()], - [], [], - self.timeout) + rd, wr, ex = select.select([self.socket.fileno()], [], [], self.timeout) if rd: self.handle_request() logging._acquireLock() abort = self.abort logging._releaseLock() - def serve(rcvr, hdlr, port): + def serve(rcvr, hdlr, port) -> None: server = rcvr(port=port, handler=hdlr) global _listener logging._acquireLock() @@ -288,14 +269,11 @@ def serve(rcvr, hdlr, port): logging._releaseLock() server.serve_until_stopped() - return threading.Thread(target=serve, - args=(ConfigSocketReceiver, - ConfigStreamHandler, port)) + return threading.Thread(target=serve, args=(ConfigSocketReceiver, ConfigStreamHandler, port)) -def stopListening(): - """ - Stop the listening server which was created with a call to listen(). - """ + +def stopListening() -> None: + """Stop the listening server which was created with a call to listen().""" global _listener if _listener: logging._acquireLock() diff --git a/code/planet/compat_logging/handlers.py b/code/planet/compat_logging/handlers.py index 6b2f3b92..c4d5a2e 100644 --- a/code/planet/compat_logging/handlers.py +++ b/code/planet/compat_logging/handlers.py @@ -14,8 +14,7 @@ # IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT # OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -""" -Logging package for Python. Based on PEP 282 and comments thereto in +"""Logging package for Python. Based on PEP 282 and comments thereto in comp.lang.python, and influenced by Apache's log4j system. Should work under Python versions >= 1.5.2, except that source line @@ -27,29 +26,28 @@ """ import logging -import socket -import types import os +import pickle +import socket import string import struct import time -import pickle +import types # # Some constants... # -DEFAULT_TCP_LOGGING_PORT = 9020 -DEFAULT_UDP_LOGGING_PORT = 9021 -DEFAULT_HTTP_LOGGING_PORT = 9022 -DEFAULT_SOAP_LOGGING_PORT = 9023 -SYSLOG_UDP_PORT = 514 +DEFAULT_TCP_LOGGING_PORT = 9020 +DEFAULT_UDP_LOGGING_PORT = 9021 +DEFAULT_HTTP_LOGGING_PORT = 9022 +DEFAULT_SOAP_LOGGING_PORT = 9023 +SYSLOG_UDP_PORT = 514 class RotatingFileHandler(logging.FileHandler): - def __init__(self, filename, mode="a", maxBytes=0, backupCount=0): - """ - Open the specified file and use it as the stream for logging. + def __init__(self, filename, mode="a", maxBytes=0, backupCount=0) -> None: + """Open the specified file and use it as the stream for logging. By default, the file grows indefinitely. You can specify particular values of maxBytes and backupCount to allow the file to rollover at @@ -74,18 +72,15 @@ def __init__(self, filename, mode="a", maxBytes=0, backupCount=0): if maxBytes > 0: self.mode = "a" - def doRollover(self): - """ - Do a rollover, as described in __init__(). 
- """ - + def doRollover(self) -> None: + """Do a rollover, as described in __init__().""" self.stream.close() if self.backupCount > 0: for i in range(self.backupCount - 1, 0, -1): sfn = "%s.%d" % (self.baseFilename, i) dfn = "%s.%d" % (self.baseFilename, i + 1) if os.path.exists(sfn): - #print "%s -> %s" % (sfn, dfn) + # print "%s -> %s" % (sfn, dfn) if os.path.exists(dfn): os.remove(dfn) os.rename(sfn, dfn) @@ -93,27 +88,25 @@ def doRollover(self): if os.path.exists(dfn): os.remove(dfn) os.rename(self.baseFilename, dfn) - #print "%s -> %s" % (self.baseFilename, dfn) + # print "%s -> %s" % (self.baseFilename, dfn) self.stream = open(self.baseFilename, "w") - def emit(self, record): - """ - Emit a record. + def emit(self, record) -> None: + """Emit a record. Output the record to the file, catering for rollover as described in doRollover(). """ - if self.maxBytes > 0: # are we rolling over? - msg = "%s\n" % self.format(record) - self.stream.seek(0, 2) #due to non-posix-compliant Windows feature + if self.maxBytes > 0: # are we rolling over? + msg = f"{self.format(record)}\n" + self.stream.seek(0, 2) # due to non-posix-compliant Windows feature if self.stream.tell() + len(msg) >= self.maxBytes: self.doRollover() logging.FileHandler.emit(self, record) class SocketHandler(logging.Handler): - """ - A handler class which writes logging records, in pickle format, to + """A handler class which writes logging records, in pickle format, to a streaming socket. The socket is kept open across logging calls. If the peer resets it, an attempt is made to reconnect on the next call. The pickle which is sent is that of the LogRecord's attribute dictionary @@ -124,9 +117,8 @@ class SocketHandler(logging.Handler): makeLogRecord function. """ - def __init__(self, host, port): - """ - Initializes the handler with a specific host address and port. + def __init__(self, host, port) -> None: + """Initializes the handler with a specific host address and port. The attribute 'closeOnError' is set to 1 - which means that if a socket error occurs, the socket is silently closed and then @@ -139,17 +131,15 @@ def __init__(self, host, port): self.closeOnError = 0 def makeSocket(self): - """ - A factory method which allows subclasses to define the precise + """A factory method which allows subclasses to define the precise type of socket they want. """ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.connect((self.host, self.port)) return s - def send(self, s): - """ - Send a pickled string to the socket. + def send(self, s) -> None: + """Send a pickled string to the socket. This function allows for partial sends which can happen when the network is busy. @@ -165,19 +155,17 @@ def send(self, s): left = left - sent def makePickle(self, record): - """ - Pickles the record in binary format with a length prefix, and + """Pickles the record in binary format with a length prefix, and returns it ready for transmission across the socket. """ s = pickle.dumps(record.__dict__, 1) - #n = len(s) - #slen = "%c%c" % ((n >> 8) & 0xFF, n & 0xFF) + # n = len(s) + # slen = "%c%c" % ((n >> 8) & 0xFF, n & 0xFF) slen = struct.pack(">L", len(s)) return slen + s - def handleError(self, record): - """ - Handle an error during logging. + def handleError(self, record) -> None: + """Handle an error during logging. An error has occurred during logging. Most likely cause - connection lost. 
Close the socket so that we can retry on the @@ -185,13 +173,12 @@ def handleError(self, record): """ if self.closeOnError and self.sock: self.sock.close() - self.sock = None #try to reconnect next time + self.sock = None # try to reconnect next time else: logging.Handler.handleError(self, record) - def emit(self, record): - """ - Emit a record. + def emit(self, record) -> None: + """Emit a record. Pickles the record and writes it to the socket in binary format. If there is an error with the socket, silently drop the packet. @@ -206,17 +193,15 @@ def emit(self, record): except: self.handleError(record) - def close(self): - """ - Closes the socket. - """ + def close(self) -> None: + """Closes the socket.""" if self.sock: self.sock.close() self.sock = None + class DatagramHandler(SocketHandler): - """ - A handler class which writes logging records, in pickle format, to + """A handler class which writes logging records, in pickle format, to a datagram socket. The pickle which is sent is that of the LogRecord's attribute dictionary (__dict__), so that the receiver does not need to have the logging module installed in order to process the logging event. @@ -225,24 +210,20 @@ class DatagramHandler(SocketHandler): makeLogRecord function. """ - def __init__(self, host, port): - """ - Initializes the handler with a specific host address and port. - """ + + def __init__(self, host, port) -> None: + """Initializes the handler with a specific host address and port.""" SocketHandler.__init__(self, host, port) self.closeOnError = 0 def makeSocket(self): - """ - The factory method of SocketHandler is here overridden to create + """The factory method of SocketHandler is here overridden to create a UDP socket (SOCK_DGRAM). """ - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - return s + return socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - def send(self, s): - """ - Send a pickled string to a socket. + def send(self, s) -> None: + """Send a pickled string to a socket. This function no longer allows for partial sends which can happen when the network is busy - UDP does not guarantee delivery and @@ -250,9 +231,9 @@ def send(self, s): """ self.sock.sendto(s, (self.host, self.port)) + class SysLogHandler(logging.Handler): - """ - A handler class which sends formatted logging records to a syslog + """A handler class which sends formatted logging records to a syslog server. 
Based on Sam Rushing's syslog module: http://www.nightmare.com/squirl/python-ext/misc/syslog.py Contributed by Nicolas Untz (after which minor refactoring changes @@ -269,79 +250,78 @@ class SysLogHandler(logging.Handler): # # priorities (these are ordered) - LOG_EMERG = 0 # system is unusable - LOG_ALERT = 1 # action must be taken immediately - LOG_CRIT = 2 # critical conditions - LOG_ERR = 3 # error conditions - LOG_WARNING = 4 # warning conditions - LOG_NOTICE = 5 # normal but significant condition - LOG_INFO = 6 # informational - LOG_DEBUG = 7 # debug-level messages + LOG_EMERG = 0 # system is unusable + LOG_ALERT = 1 # action must be taken immediately + LOG_CRIT = 2 # critical conditions + LOG_ERR = 3 # error conditions + LOG_WARNING = 4 # warning conditions + LOG_NOTICE = 5 # normal but significant condition + LOG_INFO = 6 # informational + LOG_DEBUG = 7 # debug-level messages # facility codes - LOG_KERN = 0 # kernel messages - LOG_USER = 1 # random user-level messages - LOG_MAIL = 2 # mail system - LOG_DAEMON = 3 # system daemons - LOG_AUTH = 4 # security/authorization messages - LOG_SYSLOG = 5 # messages generated internally by syslogd - LOG_LPR = 6 # line printer subsystem - LOG_NEWS = 7 # network news subsystem - LOG_UUCP = 8 # UUCP subsystem - LOG_CRON = 9 # clock daemon - LOG_AUTHPRIV = 10 # security/authorization messages (private) + LOG_KERN = 0 # kernel messages + LOG_USER = 1 # random user-level messages + LOG_MAIL = 2 # mail system + LOG_DAEMON = 3 # system daemons + LOG_AUTH = 4 # security/authorization messages + LOG_SYSLOG = 5 # messages generated internally by syslogd + LOG_LPR = 6 # line printer subsystem + LOG_NEWS = 7 # network news subsystem + LOG_UUCP = 8 # UUCP subsystem + LOG_CRON = 9 # clock daemon + LOG_AUTHPRIV = 10 # security/authorization messages (private) # other codes through 15 reserved for system use - LOG_LOCAL0 = 16 # reserved for local use - LOG_LOCAL1 = 17 # reserved for local use - LOG_LOCAL2 = 18 # reserved for local use - LOG_LOCAL3 = 19 # reserved for local use - LOG_LOCAL4 = 20 # reserved for local use - LOG_LOCAL5 = 21 # reserved for local use - LOG_LOCAL6 = 22 # reserved for local use - LOG_LOCAL7 = 23 # reserved for local use + LOG_LOCAL0 = 16 # reserved for local use + LOG_LOCAL1 = 17 # reserved for local use + LOG_LOCAL2 = 18 # reserved for local use + LOG_LOCAL3 = 19 # reserved for local use + LOG_LOCAL4 = 20 # reserved for local use + LOG_LOCAL5 = 21 # reserved for local use + LOG_LOCAL6 = 22 # reserved for local use + LOG_LOCAL7 = 23 # reserved for local use priority_names = { - "alert": LOG_ALERT, - "crit": LOG_CRIT, + "alert": LOG_ALERT, + "crit": LOG_CRIT, "critical": LOG_CRIT, - "debug": LOG_DEBUG, - "emerg": LOG_EMERG, - "err": LOG_ERR, - "error": LOG_ERR, # DEPRECATED - "info": LOG_INFO, - "notice": LOG_NOTICE, - "panic": LOG_EMERG, # DEPRECATED - "warn": LOG_WARNING, # DEPRECATED - "warning": LOG_WARNING, - } + "debug": LOG_DEBUG, + "emerg": LOG_EMERG, + "err": LOG_ERR, + "error": LOG_ERR, # DEPRECATED + "info": LOG_INFO, + "notice": LOG_NOTICE, + "panic": LOG_EMERG, # DEPRECATED + "warn": LOG_WARNING, # DEPRECATED + "warning": LOG_WARNING, + } facility_names = { - "auth": LOG_AUTH, + "auth": LOG_AUTH, "authpriv": LOG_AUTHPRIV, - "cron": LOG_CRON, - "daemon": LOG_DAEMON, - "kern": LOG_KERN, - "lpr": LOG_LPR, - "mail": LOG_MAIL, - "news": LOG_NEWS, - "security": LOG_AUTH, # DEPRECATED - "syslog": LOG_SYSLOG, - "user": LOG_USER, - "uucp": LOG_UUCP, - "local0": LOG_LOCAL0, - "local1": LOG_LOCAL1, - "local2": LOG_LOCAL2, - "local3": 
LOG_LOCAL3, - "local4": LOG_LOCAL4, - "local5": LOG_LOCAL5, - "local6": LOG_LOCAL6, - "local7": LOG_LOCAL7, - } - - def __init__(self, address=('localhost', SYSLOG_UDP_PORT), facility=LOG_USER): - """ - Initialize a handler. + "cron": LOG_CRON, + "daemon": LOG_DAEMON, + "kern": LOG_KERN, + "lpr": LOG_LPR, + "mail": LOG_MAIL, + "news": LOG_NEWS, + "security": LOG_AUTH, # DEPRECATED + "syslog": LOG_SYSLOG, + "user": LOG_USER, + "uucp": LOG_UUCP, + "local0": LOG_LOCAL0, + "local1": LOG_LOCAL1, + "local2": LOG_LOCAL2, + "local3": LOG_LOCAL3, + "local4": LOG_LOCAL4, + "local5": LOG_LOCAL5, + "local6": LOG_LOCAL6, + "local7": LOG_LOCAL7, + } + + def __init__(self, address=("localhost", SYSLOG_UDP_PORT), facility=LOG_USER) -> None: + """Initialize a handler. If address is specified as a string, UNIX socket is used. If facility is not specified, LOG_USER is used. @@ -355,7 +335,7 @@ def __init__(self, address=('localhost', SYSLOG_UDP_PORT), facility=LOG_USER): # syslog may require either DGRAM or STREAM sockets try: self.socket.connect(address) - except socket.error: + except OSError: self.socket.close() self.socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) self.socket.connect(address) @@ -370,11 +350,10 @@ def __init__(self, address=('localhost', SYSLOG_UDP_PORT), facility=LOG_USER): # zero-terminator seems to be required. this string is placed # into a class variable so that it can be overridden if # necessary. - log_format_string = '<%d>%s\000' + log_format_string = "<%d>%s\000" - def encodePriority (self, facility, priority): - """ - Encode the facility and priority. You can pass in strings or + def encodePriority(self, facility, priority): + """Encode the facility and priority. You can pass in strings or integers - if strings are passed, the facility_names and priority_names mapping dictionaries are used to convert them to integers. @@ -385,16 +364,13 @@ def encodePriority (self, facility, priority): priority = self.priority_names[priority] return (facility << 3) | priority - def close (self): - """ - Closes the socket. - """ + def close(self) -> None: + """Closes the socket.""" if self.unixsocket: self.socket.close() - def emit(self, record): - """ - Emit a record. + def emit(self, record) -> None: + """Emit a record. The record is formatted, and then sent to the syslog server. If exception information is present, it is NOT sent to the server. @@ -404,10 +380,7 @@ def emit(self, record): We need to convert record level to lowercase, maybe this will change in the future. """ - msg = self.log_format_string % ( - self.encodePriority(self.facility, - string.lower(record.levelname)), - msg) + msg = self.log_format_string % (self.encodePriority(self.facility, string.lower(record.levelname)), msg) try: if self.unixsocket: self.socket.send(msg) @@ -416,13 +389,12 @@ def emit(self, record): except: self.handleError(record) + class SMTPHandler(logging.Handler): - """ - A handler class which sends an SMTP email for each logging event. - """ - def __init__(self, mailhost, fromaddr, toaddrs, subject): - """ - Initialize the handler. + """A handler class which sends an SMTP email for each logging event.""" + + def __init__(self, mailhost, fromaddr, toaddrs, subject) -> None: + """Initialize the handler. Initialize the instance with the from and to addresses and subject line of the email. 
To specify a non-standard SMTP port, use the @@ -443,55 +415,54 @@ def __init__(self, mailhost, fromaddr, toaddrs, subject): self.subject = subject def getSubject(self, record): - """ - Determine the subject for the email. + """Determine the subject for the email. If you want to specify a subject line which is record-dependent, override this method. """ return self.subject - weekdayname = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + weekdayname = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] - monthname = [None, - 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', - 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + monthname = [None, "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] - def date_time(self): + def date_time(self) -> str: """Return the current date and time formatted for a MIME header.""" year, month, day, hh, mm, ss, wd, y, z = time.gmtime(time.time()) - s = "%s, %02d %3s %4d %02d:%02d:%02d GMT" % ( - self.weekdayname[wd], - day, self.monthname[month], year, - hh, mm, ss) - return s - - def emit(self, record): - """ - Emit a record. + return "%s, %02d %3s %4d %02d:%02d:%02d GMT" % ( + self.weekdayname[wd], + day, + self.monthname[month], + year, + hh, + mm, + ss, + ) + + def emit(self, record) -> None: + """Emit a record. Format the record and send it to the specified addressees. """ try: import smtplib + port = self.mailport if not port: port = smtplib.SMTP_PORT smtp = smtplib.SMTP(self.mailhost, port) msg = self.format(record) - msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\nDate: %s\r\n\r\n%s" % ( - self.fromaddr, - string.join(self.toaddrs, ","), - self.getSubject(record), - self.date_time(), msg) + msg = "From: {}\r\nTo: {}\r\nSubject: {}\r\nDate: {}\r\n\r\n{}".format( + self.fromaddr, string.join(self.toaddrs, ","), self.getSubject(record), self.date_time(), msg + ) smtp.sendmail(self.fromaddr, self.toaddrs, msg) smtp.quit() except: self.handleError(record) + class NTEventLogHandler(logging.Handler): - """ - A handler class which sends events to the NT Event Log. Adds a + """A handler class which sends events to the NT Event Log. Adds a registry entry for the specified application name. If no dllname is provided, win32service.pyd (which contains some basic message placeholders) is used. Note that use of these placeholders will make @@ -499,35 +470,35 @@ class NTEventLogHandler(logging.Handler): If you want slimmer logs, you have to pass in the name of your own DLL which contains the message definitions you want to use in the event log. 
""" - def __init__(self, appname, dllname=None, logtype="Application"): + + def __init__(self, appname, dllname=None, logtype="Application") -> None: logging.Handler.__init__(self) try: - import win32evtlogutil, win32evtlog + import win32evtlog + import win32evtlogutil + self.appname = appname self._welu = win32evtlogutil if not dllname: dllname = os.path.split(self._welu.__file__) dllname = os.path.split(dllname[0]) - dllname = os.path.join(dllname[0], r'win32service.pyd') + dllname = os.path.join(dllname[0], r"win32service.pyd") self.dllname = dllname self.logtype = logtype self._welu.AddSourceToRegistry(appname, dllname, logtype) self.deftype = win32evtlog.EVENTLOG_ERROR_TYPE self.typemap = { - logging.DEBUG : win32evtlog.EVENTLOG_INFORMATION_TYPE, - logging.INFO : win32evtlog.EVENTLOG_INFORMATION_TYPE, - logging.WARNING : win32evtlog.EVENTLOG_WARNING_TYPE, - logging.ERROR : win32evtlog.EVENTLOG_ERROR_TYPE, + logging.DEBUG: win32evtlog.EVENTLOG_INFORMATION_TYPE, + logging.INFO: win32evtlog.EVENTLOG_INFORMATION_TYPE, + logging.WARNING: win32evtlog.EVENTLOG_WARNING_TYPE, + logging.ERROR: win32evtlog.EVENTLOG_ERROR_TYPE, logging.CRITICAL: win32evtlog.EVENTLOG_ERROR_TYPE, - } + } except ImportError: - print("The Python Win32 extensions for NT (service, event "\ - "logging) appear not to be available.") self._welu = None - def getMessageID(self, record): - """ - Return the message ID for the event record. If you are using your + def getMessageID(self, record) -> int: + """Return the message ID for the event record. If you are using your own messages, you could do this by having the msg passed to the logger being an ID rather than a formatting string. Then, in here, you could use a dictionary lookup to get the message ID. This @@ -535,9 +506,8 @@ def getMessageID(self, record): """ return 1 - def getEventCategory(self, record): - """ - Return the event category for the record. + def getEventCategory(self, record) -> int: + """Return the event category for the record. Override this if you want to specify your own categories. This version returns 0. @@ -545,8 +515,7 @@ def getEventCategory(self, record): return 0 def getEventType(self, record): - """ - Return the event type for the record. + """Return the event type for the record. Override this if you want to specify your own types. This version does a mapping using the handler's typemap attribute, which is set up in @@ -557,9 +526,8 @@ def getEventType(self, record): """ return self.typemap.get(record.levelno, self.deftype) - def emit(self, record): - """ - Emit a record. + def emit(self, record) -> None: + """Emit a record. Determine the message ID, event category and event type. Then log the message in the NT event log. @@ -574,9 +542,8 @@ def emit(self, record): except: self.handleError(record) - def close(self): - """ - Clean up this handler. + def close(self) -> None: + """Clean up this handler. You can remove the application name from the registry as a source of event log entries. However, if you do this, you will @@ -584,51 +551,49 @@ def close(self): Viewer - it needs to be able to access the registry to get the DLL name. """ - #self._welu.RemoveSourceFromRegistry(self.appname, self.logtype) - pass + # self._welu.RemoveSourceFromRegistry(self.appname, self.logtype) + class HTTPHandler(logging.Handler): - """ - A class which sends records to a Web server, using either GET or + """A class which sends records to a Web server, using either GET or POST semantics. 
""" - def __init__(self, host, url, method="GET"): - """ - Initialize the instance with the host, the request URL, and the method - ("GET" or "POST") + + def __init__(self, host, url, method="GET") -> None: + """Initialize the instance with the host, the request URL, and the method + ("GET" or "POST"). """ logging.Handler.__init__(self) method = string.upper(method) if method not in ["GET", "POST"]: - raise ValueError("method must be GET or POST") + msg = "method must be GET or POST" + raise ValueError(msg) self.host = host self.url = url self.method = method def mapLogRecord(self, record): - """ - Default implementation of mapping the log record into a dict + """Default implementation of mapping the log record into a dict that is send as the CGI data. Overwrite in your class. Contributed by Franz Glasner. """ return record.__dict__ - def emit(self, record): - """ - Emit a record. + def emit(self, record) -> None: + """Emit a record. Send the record to the Web server as an URL-encoded dictionary """ try: - import httplib, urllib + import urllib + + import httplib + h = httplib.HTTP(self.host) url = self.url data = urllib.urlencode(self.mapLogRecord(record)) if self.method == "GET": - if (string.find(url, '?') >= 0): - sep = '&' - else: - sep = '?' + sep = "&" if string.find(url, "?") >= 0 else "?" url = url + "%c%s" % (sep, data) h.putrequest(self.method, url) if self.method == "POST": @@ -636,36 +601,33 @@ def emit(self, record): h.endheaders() if self.method == "POST": h.send(data) - h.getreply() #can't do anything with the result + h.getreply() # can't do anything with the result except: self.handleError(record) + class BufferingHandler(logging.Handler): + """A handler class which buffers logging records in memory. Whenever each + record is added to the buffer, a check is made to see if the buffer should + be flushed. If it should, then flush() is expected to do what's needed. """ - A handler class which buffers logging records in memory. Whenever each - record is added to the buffer, a check is made to see if the buffer should - be flushed. If it should, then flush() is expected to do what's needed. - """ - def __init__(self, capacity): - """ - Initialize the handler with the buffer size. - """ + + def __init__(self, capacity) -> None: + """Initialize the handler with the buffer size.""" logging.Handler.__init__(self) self.capacity = capacity self.buffer = [] def shouldFlush(self, record): - """ - Should the handler flush its buffer? + """Should the handler flush its buffer?. Returns true if the buffer is up to capacity. This method can be overridden to implement custom flushing strategies. """ - return (len(self.buffer) >= self.capacity) + return len(self.buffer) >= self.capacity - def emit(self, record): - """ - Emit a record. + def emit(self, record) -> None: + """Emit a record. Append the record. If shouldFlush() tells us to, call flush() to process the buffer. @@ -674,23 +636,22 @@ def emit(self, record): if self.shouldFlush(record): self.flush() - def flush(self): - """ - Override to implement custom flushing behaviour. + def flush(self) -> None: + """Override to implement custom flushing behaviour. This version just zaps the buffer to empty. """ self.buffer = [] + class MemoryHandler(BufferingHandler): - """ - A handler class which buffers logging records in memory, periodically + """A handler class which buffers logging records in memory, periodically flushing them to a target handler. 
Flushing occurs whenever the buffer is full, or when an event of a certain severity or greater is seen. """ - def __init__(self, capacity, flushLevel=logging.ERROR, target=None): - """ - Initialize the handler with the buffer size, the level at which + + def __init__(self, capacity, flushLevel=logging.ERROR, target=None) -> None: + """Initialize the handler with the buffer size, the level at which flushing should occur and an optional target. Note that without a target being set either here or via setTarget(), @@ -701,21 +662,15 @@ def __init__(self, capacity, flushLevel=logging.ERROR, target=None): self.target = target def shouldFlush(self, record): - """ - Check for buffer full or a record at the flushLevel or higher. - """ - return (len(self.buffer) >= self.capacity) or \ - (record.levelno >= self.flushLevel) + """Check for buffer full or a record at the flushLevel or higher.""" + return (len(self.buffer) >= self.capacity) or (record.levelno >= self.flushLevel) - def setTarget(self, target): - """ - Set the target handler for this handler. - """ + def setTarget(self, target) -> None: + """Set the target handler for this handler.""" self.target = target - def flush(self): - """ - For a MemoryHandler, flushing means just sending the buffered + def flush(self) -> None: + """For a MemoryHandler, flushing means just sending the buffered records to the target, if there is one. Override if you want different behaviour. """ @@ -724,10 +679,8 @@ def flush(self): self.target.handle(record) self.buffer = [] - def close(self): - """ - Flush, set the target to None and lose the buffer. - """ + def close(self) -> None: + """Flush, set the target to None and lose the buffer.""" self.flush() self.target = None self.buffer = [] diff --git a/code/planet/feedparser.py b/code/planet/feedparser.py index f38dc6f..6893dbf 100644 --- a/code/planet/feedparser.py +++ b/code/planet/feedparser.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Universal feed parser +"""Universal feed parser. Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds @@ -13,7 +13,7 @@ TODO: py2->3 conversion """ -__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs" +__version__ = "4.1" # + "$Revision: 1.92 $"[11:15] + "-cvs" __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. Redistribution and use in source and binary forms, with or without modification, @@ -37,11 +37,13 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.""" __author__ = "Mark Pilgrim " -__contributors__ = ["Jason Diamond ", - "John Beimler ", - "Fazal Majid ", - "Aaron Swartz ", - "Kevin Marks "] +__contributors__ = [ + "Jason Diamond ", + "John Beimler ", + "Fazal Majid ", + "Aaron Swartz ", + "Kevin Marks ", +] import calendar import email.utils @@ -53,7 +55,7 @@ # HTTP "User-Agent" header to send to servers when downloading feeds. # If you are embedding feedparser in a larger application, you should # change this to your application name and URL. -USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__ +USER_AGENT = f"UniversalFeedParser/{__version__} +http://feedparser.org/" # HTTP "Accept" header to send to servers when downloading feeds. If you don't # want to send an Accept header, set this to None. 
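# --- illustrative sketch (not part of the patch) -----------------------------
# As the two comments above describe, an embedding application would override
# these module-level constants before parsing; "MyPlanet" and the URL are
# hypothetical placeholders:
#
#     import feedparser
#     feedparser.USER_AGENT = "MyPlanet/1.0 +http://example.org/"
#     feedparser.ACCEPT_HEADER = None  # don't send an Accept header at all
# ------------------------------------------------------------------------------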
@@ -74,15 +76,17 @@
 PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

 # ---------- required modules (should come with any Python distribution) ----------
+import cgi
+import contextlib
+import copy
 import re
 import sys
-import copy
 import time
 import types
-import cgi
 import urllib.parse
 import urllib.request
 import urllib.response
+from typing import Never

 try:
     from io import StringIO as _StringIO
@@ -107,101 +111,127 @@
 # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
 try:
     import xml.sax
-    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
+
+    xml.sax.make_parser(PREFERRED_XML_PARSERS)  # test for valid parsers
     from xml.sax.saxutils import escape as _xmlescape
+
     _XML_AVAILABLE = 1
 except:
     _XML_AVAILABLE = 0
-    def _xmlescape(data,entities={}):
-        data = data.replace('&', '&amp;')
-        data = data.replace('>', '&gt;')
-        data = data.replace('<', '&lt;')
+
+    def _xmlescape(data, entities=None):
+        if entities is None:
+            entities = {}
+        data = data.replace("&", "&amp;")
+        data = data.replace(">", "&gt;")
+        data = data.replace("<", "&lt;")
         for char, entity in entities:
             data = data.replace(char, entity)
         return data

+
 # base64 support for Atom feeds that contain embedded binary data
 try:
-    import base64, binascii
+    import base64
+    import binascii
 except:
     base64 = binascii = None

 # cjkcodecs and iconv_codec provide support for more character encodings.
 # Both are available from http://cjkpython.i18n.org/
-try:
-    import cjkcodecs.aliases
-except:
-    pass
+with contextlib.suppress(Exception):
+    import cjkcodecs.aliases
-try:
-    import iconv_codec
-except:
-    pass
+with contextlib.suppress(Exception):
+    import iconv_codec

 # chardet library auto-detects character encodings
 # Download from http://chardet.feedparser.org/
 try:
     import chardet
+
     if _debug:
         import chardet.constants
+
         chardet.constants._debug = 1
 except:
     chardet = None

+
 # ---------- don't touch these ----------
-class ThingsNobodyCaresAboutButMe(Exception): pass
-class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
-class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
-class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
-class UndeclaredNamespace(Exception): pass
-
-SUPPORTED_VERSIONS = {'': 'unknown',
-                      'rss090': 'RSS 0.90',
-                      'rss091n': 'RSS 0.91 (Netscape)',
-                      'rss091u': 'RSS 0.91 (Userland)',
-                      'rss092': 'RSS 0.92',
-                      'rss093': 'RSS 0.93',
-                      'rss094': 'RSS 0.94',
-                      'rss20': 'RSS 2.0',
-                      'rss10': 'RSS 1.0',
-                      'rss': 'RSS (unknown version)',
-                      'atom01': 'Atom 0.1',
-                      'atom02': 'Atom 0.2',
-                      'atom03': 'Atom 0.3',
-                      'atom10': 'Atom 1.0',
-                      'atom': 'Atom (unknown version)',
-                      'cdf': 'CDF',
-                      'hotrss': 'Hot RSS'
-                      }
+class ThingsNobodyCaresAboutButMe(Exception):
+    pass
+
+
+class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
+    pass
+
+
+class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
+    pass
+
+
+class NonXMLContentType(ThingsNobodyCaresAboutButMe):
+    pass
+
+
+class UndeclaredNamespace(Exception):
+    pass
+
+
+SUPPORTED_VERSIONS = {
+    "": "unknown",
+    "rss090": "RSS 0.90",
+    "rss091n": "RSS 0.91 (Netscape)",
+    "rss091u": "RSS 0.91 (Userland)",
+    "rss092": "RSS 0.92",
+    "rss093": "RSS 0.93",
+    "rss094": "RSS 0.94",
+    "rss20": "RSS 2.0",
+    "rss10": "RSS 1.0",
+    "rss": "RSS (unknown version)",
+    "atom01": "Atom 0.1",
+    "atom02": "Atom 0.2",
+    "atom03": "Atom 0.3",
+    "atom10": "Atom 1.0",
+    "atom": "Atom (unknown version)",
+    "cdf": "CDF",
+    "hotrss": "Hot RSS",
+}

 try:
     UserDict = dict
 except NameError:
     # Python 2.1 does not have dict
     from collections import UserDict
+
     def dict(aList):
         return dict(aList)

+
 class 
FeedParserDict(UserDict): - keymap = {'channel': 'feed', - 'items': 'entries', - 'guid': 'id', - 'date': 'updated', - 'date_parsed': 'updated_parsed', - 'description': ['subtitle', 'summary'], - 'url': ['href'], - 'modified': 'updated', - 'modified_parsed': 'updated_parsed', - 'issued': 'published', - 'issued_parsed': 'published_parsed', - 'copyright': 'rights', - 'copyright_detail': 'rights_detail', - 'tagline': 'subtitle', - 'tagline_detail': 'subtitle_detail'} + keymap = { + "channel": "feed", + "items": "entries", + "guid": "id", + "date": "updated", + "date_parsed": "updated_parsed", + "description": ["subtitle", "summary"], + "url": ["href"], + "modified": "updated", + "modified_parsed": "updated_parsed", + "issued": "published", + "issued_parsed": "published_parsed", + "copyright": "rights", + "copyright_detail": "rights_detail", + "tagline": "subtitle", + "tagline_detail": "subtitle_detail", + } + def __getitem__(self, key): - if key == 'category': - return UserDict.__getitem__(self, 'tags')[0]['term'] - if key == 'categories': - return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')] + if key == "category": + return UserDict.__getitem__(self, "tags")[0]["term"] + if key == "categories": + return [(tag["scheme"], tag["term"]) for tag in UserDict.__getitem__(self, "tags")] realkey = self.keymap.get(key, key) if type(realkey) == types.ListType: for k in realkey: @@ -211,8 +241,8 @@ def __getitem__(self, key): return UserDict.__getitem__(self, key) return UserDict.__getitem__(self, realkey) - def __setitem__(self, key, value): - for k in self.keymap.keys(): + def __setitem__(self, key, value) -> None: + for k in self.keymap: if key == k: key = self.keymap[k] if type(key) == types.ListType: @@ -242,169 +272,454 @@ def __getattr__(self, key): except KeyError: pass try: - assert not key.startswith('_') + assert not key.startswith("_") return self.__getitem__(key) except: - raise AttributeError("object has no attribute '%s'" % key) + msg = f"object has no attribute '{key}'" + raise AttributeError(msg) - def __setattr__(self, key, value): - if key.startswith('_') or key == 'data': + def __setattr__(self, key, value) -> None: + if key.startswith("_") or key == "data": self.__dict__[key] = value + return None else: return self.__setitem__(key, value) - def __contains__(self, key): + def __contains__(self, key) -> bool: return self.has_key(key) -def zopeCompatibilityHack(): + +def zopeCompatibilityHack() -> None: global FeedParserDict del FeedParserDict + def FeedParserDict(aDict=None): rc = {} if aDict: rc.update(aDict) return rc + _ebcdic_to_ascii_map = None + + def _ebcdic_to_ascii(s): global _ebcdic_to_ascii_map if not _ebcdic_to_ascii_map: emap = ( - 0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201, - 202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208, - 209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215, - 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231, - 123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237, - 
125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243, - 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249, - 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255 - ) + 0, + 1, + 2, + 3, + 156, + 9, + 134, + 127, + 151, + 141, + 142, + 11, + 12, + 13, + 14, + 15, + 16, + 17, + 18, + 19, + 157, + 133, + 8, + 135, + 24, + 25, + 146, + 143, + 28, + 29, + 30, + 31, + 128, + 129, + 130, + 131, + 132, + 10, + 23, + 27, + 136, + 137, + 138, + 139, + 140, + 5, + 6, + 7, + 144, + 145, + 22, + 147, + 148, + 149, + 150, + 4, + 152, + 153, + 154, + 155, + 20, + 21, + 158, + 26, + 32, + 160, + 161, + 162, + 163, + 164, + 165, + 166, + 167, + 168, + 91, + 46, + 60, + 40, + 43, + 33, + 38, + 169, + 170, + 171, + 172, + 173, + 174, + 175, + 176, + 177, + 93, + 36, + 42, + 41, + 59, + 94, + 45, + 47, + 178, + 179, + 180, + 181, + 182, + 183, + 184, + 185, + 124, + 44, + 37, + 95, + 62, + 63, + 186, + 187, + 188, + 189, + 190, + 191, + 192, + 193, + 194, + 96, + 58, + 35, + 64, + 39, + 61, + 34, + 195, + 97, + 98, + 99, + 100, + 101, + 102, + 103, + 104, + 105, + 196, + 197, + 198, + 199, + 200, + 201, + 202, + 106, + 107, + 108, + 109, + 110, + 111, + 112, + 113, + 114, + 203, + 204, + 205, + 206, + 207, + 208, + 209, + 126, + 115, + 116, + 117, + 118, + 119, + 120, + 121, + 122, + 210, + 211, + 212, + 213, + 214, + 215, + 216, + 217, + 218, + 219, + 220, + 221, + 222, + 223, + 224, + 225, + 226, + 227, + 228, + 229, + 230, + 231, + 123, + 65, + 66, + 67, + 68, + 69, + 70, + 71, + 72, + 73, + 232, + 233, + 234, + 235, + 236, + 237, + 125, + 74, + 75, + 76, + 77, + 78, + 79, + 80, + 81, + 82, + 238, + 239, + 240, + 241, + 242, + 243, + 92, + 159, + 83, + 84, + 85, + 86, + 87, + 88, + 89, + 90, + 244, + 245, + 246, + 247, + 248, + 249, + 48, + 49, + 50, + 51, + 52, + 53, + 54, + 55, + 56, + 57, + 250, + 251, + 252, + 253, + 254, + 255, + ) import string - _ebcdic_to_ascii_map = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) + + _ebcdic_to_ascii_map = string.maketrans("".join(map(chr, range(256))), "".join(map(chr, emap))) return s.translate(_ebcdic_to_ascii_map) + cp1252 = { - chr(128): chr(8364), # euro sign - chr(130): chr(8218), # single low-9 quotation mark - chr(131): chr( 402), # latin small letter f with hook - chr(132): chr(8222), # double low-9 quotation mark - chr(133): chr(8230), # horizontal ellipsis - chr(134): chr(8224), # dagger - chr(135): chr(8225), # double dagger - chr(136): chr( 710), # modifier letter circumflex accent - chr(137): chr(8240), # per mille sign - chr(138): chr( 352), # latin capital letter s with caron - chr(139): chr(8249), # single left-pointing angle quotation mark - chr(140): chr( 338), # latin capital ligature oe - chr(142): chr( 381), # latin capital letter z with caron - chr(145): chr(8216), # left single quotation mark - chr(146): chr(8217), # right single quotation mark - chr(147): chr(8220), # left double quotation mark - chr(148): chr(8221), # right double quotation mark - chr(149): chr(8226), # bullet - chr(150): chr(8211), # en dash - chr(151): chr(8212), # em dash - chr(152): chr( 732), # small tilde - chr(153): chr(8482), # trade mark sign - chr(154): chr( 353), # latin small letter s with caron - chr(155): chr(8250), # single right-pointing angle quotation mark - chr(156): chr( 339), # latin small ligature oe - chr(158): chr( 382), # latin small letter z with caron - chr(159): chr( 376)} # latin capital letter y with diaeresis - -_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)') + chr(128): chr(8364), # 
euro sign + chr(130): chr(8218), # single low-9 quotation mark + chr(131): chr(402), # latin small letter f with hook + chr(132): chr(8222), # double low-9 quotation mark + chr(133): chr(8230), # horizontal ellipsis + chr(134): chr(8224), # dagger + chr(135): chr(8225), # double dagger + chr(136): chr(710), # modifier letter circumflex accent + chr(137): chr(8240), # per mille sign + chr(138): chr(352), # latin capital letter s with caron + chr(139): chr(8249), # single left-pointing angle quotation mark + chr(140): chr(338), # latin capital ligature oe + chr(142): chr(381), # latin capital letter z with caron + chr(145): chr(8216), # left single quotation mark + chr(146): chr(8217), # right single quotation mark + chr(147): chr(8220), # left double quotation mark + chr(148): chr(8221), # right double quotation mark + chr(149): chr(8226), # bullet + chr(150): chr(8211), # en dash + chr(151): chr(8212), # em dash + chr(152): chr(732), # small tilde + chr(153): chr(8482), # trade mark sign + chr(154): chr(353), # latin small letter s with caron + chr(155): chr(8250), # single right-pointing angle quotation mark + chr(156): chr(339), # latin small ligature oe + chr(158): chr(382), # latin small letter z with caron + chr(159): chr(376), +} # latin capital letter y with diaeresis + +_urifixer = re.compile("^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)") + + def _urljoin(base, uri): - uri = _urifixer.sub(r'\1\3', uri) + uri = _urifixer.sub(r"\1\3", uri) return urllib.parse.urljoin(base, uri) + class _FeedParserMixin: - namespaces = {'': '', - 'http://backend.userland.com/rss': '', - 'http://blogs.law.harvard.edu/tech/rss': '', - 'http://purl.org/rss/1.0/': '', - 'http://my.netscape.com/rdf/simple/0.9/': '', - 'http://example.com/newformat#': '', - 'http://example.com/necho': '', - 'http://purl.org/echo/': '', - 'uri/of/echo/namespace#': '', - 'http://purl.org/pie/': '', - 'http://purl.org/atom/ns#': '', - 'http://www.w3.org/2005/Atom': '', - 'http://purl.org/rss/1.0/modules/rss091#': '', - - 'http://webns.net/mvcb/': 'admin', - 'http://purl.org/rss/1.0/modules/aggregation/': 'ag', - 'http://purl.org/rss/1.0/modules/annotate/': 'annotate', - 'http://media.tangent.org/rss/1.0/': 'audio', - 'http://backend.userland.com/blogChannelModule': 'blogChannel', - 'http://web.resource.org/cc/': 'cc', - 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons', - 'http://purl.org/rss/1.0/modules/company': 'co', - 'http://purl.org/rss/1.0/modules/content/': 'content', - 'http://my.theinfo.org/changed/1.0/rss/': 'cp', - 'http://purl.org/dc/elements/1.1/': 'dc', - 'http://purl.org/dc/terms/': 'dcterms', - 'http://purl.org/rss/1.0/modules/email/': 'email', - 'http://purl.org/rss/1.0/modules/event/': 'ev', - 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner', - 'http://freshmeat.net/rss/fm/': 'fm', - 'http://xmlns.com/foaf/0.1/': 'foaf', - 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', - 'http://postneo.com/icbm/': 'icbm', - 'http://purl.org/rss/1.0/modules/image/': 'image', - 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', - 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', - 'http://purl.org/rss/1.0/modules/link/': 'l', - 'http://search.yahoo.com/mrss': 'media', - 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', - 'http://prismstandard.org/namespaces/1.2/basic/': 'prism', - 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', - 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', - 'http://purl.org/rss/1.0/modules/reference/': 'ref', - 
'http://purl.org/rss/1.0/modules/richequiv/': 'reqv', - 'http://purl.org/rss/1.0/modules/search/': 'search', - 'http://purl.org/rss/1.0/modules/slash/': 'slash', - 'http://schemas.xmlsoap.org/soap/envelope/': 'soap', - 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss', - 'http://hacks.benhammersley.com/rss/streaming/': 'str', - 'http://purl.org/rss/1.0/modules/subscription/': 'sub', - 'http://purl.org/rss/1.0/modules/syndication/': 'sy', - 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo', - 'http://purl.org/rss/1.0/modules/threading/': 'thr', - 'http://purl.org/rss/1.0/modules/textinput/': 'ti', - 'http://madskills.com/public/xml/rss/module/trackback/':'trackback', - 'http://wellformedweb.org/commentAPI/': 'wfw', - 'http://purl.org/rss/1.0/modules/wiki/': 'wiki', - 'http://www.w3.org/1999/xhtml': 'xhtml', - 'http://www.w3.org/XML/1998/namespace': 'xml', - 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf' -} + namespaces = { + "": "", + "http://backend.userland.com/rss": "", + "http://blogs.law.harvard.edu/tech/rss": "", + "http://purl.org/rss/1.0/": "", + "http://my.netscape.com/rdf/simple/0.9/": "", + "http://example.com/newformat#": "", + "http://example.com/necho": "", + "http://purl.org/echo/": "", + "uri/of/echo/namespace#": "", + "http://purl.org/pie/": "", + "http://purl.org/atom/ns#": "", + "http://www.w3.org/2005/Atom": "", + "http://purl.org/rss/1.0/modules/rss091#": "", + "http://webns.net/mvcb/": "admin", + "http://purl.org/rss/1.0/modules/aggregation/": "ag", + "http://purl.org/rss/1.0/modules/annotate/": "annotate", + "http://media.tangent.org/rss/1.0/": "audio", + "http://backend.userland.com/blogChannelModule": "blogChannel", + "http://web.resource.org/cc/": "cc", + "http://backend.userland.com/creativeCommonsRssModule": "creativeCommons", + "http://purl.org/rss/1.0/modules/company": "co", + "http://purl.org/rss/1.0/modules/content/": "content", + "http://my.theinfo.org/changed/1.0/rss/": "cp", + "http://purl.org/dc/elements/1.1/": "dc", + "http://purl.org/dc/terms/": "dcterms", + "http://purl.org/rss/1.0/modules/email/": "email", + "http://purl.org/rss/1.0/modules/event/": "ev", + "http://rssnamespace.org/feedburner/ext/1.0": "feedburner", + "http://freshmeat.net/rss/fm/": "fm", + "http://xmlns.com/foaf/0.1/": "foaf", + "http://www.w3.org/2003/01/geo/wgs84_pos#": "geo", + "http://postneo.com/icbm/": "icbm", + "http://purl.org/rss/1.0/modules/image/": "image", + "http://www.itunes.com/DTDs/PodCast-1.0.dtd": "itunes", + "http://example.com/DTDs/PodCast-1.0.dtd": "itunes", + "http://purl.org/rss/1.0/modules/link/": "l", + "http://search.yahoo.com/mrss": "media", + "http://madskills.com/public/xml/rss/module/pingback/": "pingback", + "http://prismstandard.org/namespaces/1.2/basic/": "prism", + "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf", + "http://www.w3.org/2000/01/rdf-schema#": "rdfs", + "http://purl.org/rss/1.0/modules/reference/": "ref", + "http://purl.org/rss/1.0/modules/richequiv/": "reqv", + "http://purl.org/rss/1.0/modules/search/": "search", + "http://purl.org/rss/1.0/modules/slash/": "slash", + "http://schemas.xmlsoap.org/soap/envelope/": "soap", + "http://purl.org/rss/1.0/modules/servicestatus/": "ss", + "http://hacks.benhammersley.com/rss/streaming/": "str", + "http://purl.org/rss/1.0/modules/subscription/": "sub", + "http://purl.org/rss/1.0/modules/syndication/": "sy", + "http://purl.org/rss/1.0/modules/taxonomy/": "taxo", + "http://purl.org/rss/1.0/modules/threading/": "thr", + "http://purl.org/rss/1.0/modules/textinput/": "ti", + 
"http://madskills.com/public/xml/rss/module/trackback/": "trackback", + "http://wellformedweb.org/commentAPI/": "wfw", + "http://purl.org/rss/1.0/modules/wiki/": "wiki", + "http://www.w3.org/1999/xhtml": "xhtml", + "http://www.w3.org/XML/1998/namespace": "xml", + "http://schemas.pocketsoap.com/rss/myDescModule/": "szf", + } _matchnamespaces = {} - can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo'] - can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] - can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] - html_types = ['text/html', 'application/xhtml+xml'] - - def __init__(self, baseuri=None, baselang=None, encoding='utf-8'): - if _debug: sys.stderr.write('initializing FeedParser\n') + can_be_relative_uri = [ + "link", + "id", + "wfw_comment", + "wfw_commentrss", + "docs", + "url", + "href", + "comments", + "license", + "icon", + "logo", + ] + can_contain_relative_uris = [ + "content", + "title", + "summary", + "info", + "tagline", + "subtitle", + "copyright", + "rights", + "description", + ] + can_contain_dangerous_markup = [ + "content", + "title", + "summary", + "info", + "tagline", + "subtitle", + "copyright", + "rights", + "description", + ] + html_types = ["text/html", "application/xhtml+xml"] + + def __init__(self, baseuri=None, baselang=None, encoding="utf-8") -> None: + if _debug: + sys.stderr.write("initializing FeedParser\n") if not self._matchnamespaces: for k, v in self.namespaces.items(): self._matchnamespaces[k.lower()] = v - self.feeddata = FeedParserDict() # feed-level data - self.encoding = encoding # character encoding - self.entries = [] # list of entry-level data - self.version = '' # feed type/version, see SUPPORTED_VERSIONS - self.namespacesInUse = {} # dictionary of namespaces defined by the feed + self.feeddata = FeedParserDict() # feed-level data + self.encoding = encoding # character encoding + self.entries = [] # list of entry-level data + self.version = "" # feed type/version, see SUPPORTED_VERSIONS + self.namespacesInUse = {} # dictionary of namespaces defined by the feed # the following are used internally to track state; # this is really out of control and should be refactored @@ -424,47 +739,51 @@ def __init__(self, baseuri=None, baselang=None, encoding='utf-8'): self.elementstack = [] self.basestack = [] self.langstack = [] - self.baseuri = baseuri or '' + self.baseuri = baseuri or "" self.lang = baselang or None if baselang: - self.feeddata['language'] = baselang + self.feeddata["language"] = baselang def unknown_starttag(self, tag, attrs): - if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs)) + if _debug: + sys.stderr.write(f"start {tag} with {attrs}\n") # normalize attrs attrs = [(k.lower(), v) for k, v in attrs] - attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] + attrs = [(k, k in ("rel", "type") and v.lower() or v) for k, v in attrs] # track xml:base and xml:lang attrsD = dict(attrs) - baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri + baseuri = attrsD.get("xml:base", attrsD.get("base")) or self.baseuri self.baseuri = _urljoin(self.baseuri, baseuri) - lang = attrsD.get('xml:lang', attrsD.get('lang')) - if lang == '': + lang = attrsD.get("xml:lang", attrsD.get("lang")) + if lang == "": # xml:lang could be explicitly set to '', we need to capture that lang = None 
elif lang is None: # if no xml:lang is specified, use parent lang lang = self.lang - if lang: - if tag in ('feed', 'rss', 'rdf:RDF'): - self.feeddata['language'] = lang + if lang and tag in ("feed", "rss", "rdf:RDF"): + self.feeddata["language"] = lang self.lang = lang self.basestack.append(self.baseuri) self.langstack.append(lang) # track namespaces for prefix, uri in attrs: - if prefix.startswith('xmlns:'): + if prefix.startswith("xmlns:"): self.trackNamespace(prefix[6:], uri) - elif prefix == 'xmlns': + elif prefix == "xmlns": self.trackNamespace(None, uri) # track inline content - if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): + if ( + self.incontent + and self.contentparams.has_key("type") + and not self.contentparams.get("type", "xml").endswith("xml") + ): # element declared itself as escaped markup, but it isn't really - self.contentparams['type'] = 'application/xhtml+xml' - if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': + self.contentparams["type"] = "application/xhtml+xml" + if self.incontent and self.contentparams.get("type") == "application/xhtml+xml": # Note: probably shouldn't simply recreate localname here, but # our namespace handling isn't actually 100% correct in cases where # the feed redefines the default namespace (which is actually @@ -473,45 +792,46 @@ def unknown_starttag(self, tag, attrs): # because that compensates for the bugs in our namespace handling. # This will horribly munge inline content with non-empty qnames, # but nobody actually does that, so I'm not fixing it. - tag = tag.split(':')[-1] - return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0) + tag = tag.split(":")[-1] + return self.handle_data(f"<{tag}{self.strattrs(attrs)}>", escape=0) # match namespaces - if tag.find(':') != -1: - prefix, suffix = tag.split(':', 1) + if tag.find(":") != -1: + prefix, suffix = tag.split(":", 1) else: - prefix, suffix = '', tag + prefix, suffix = "", tag prefix = self.namespacemap.get(prefix, prefix) if prefix: - prefix = prefix + '_' + prefix = prefix + "_" # special hack for better tracking of empty textinput/image elements in illformed feeds - if (not prefix) and tag not in ('title', 'link', 'description', 'name'): + if (not prefix) and tag not in ("title", "link", "description", "name"): self.intextinput = 0 - if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'): + if (not prefix) and tag not in ("title", "link", "description", "url", "href", "width", "height"): self.inimage = 0 # call special handler (if defined) or default handler - methodname = '_start_' + prefix + suffix + methodname = "_start_" + prefix + suffix try: method = getattr(self, methodname) return method(attrsD) except AttributeError: return self.push(prefix + suffix, 1) - def unknown_endtag(self, tag): - if _debug: sys.stderr.write('end %s\n' % tag) + def unknown_endtag(self, tag) -> None: + if _debug: + sys.stderr.write(f"end {tag}\n") # match namespaces - if tag.find(':') != -1: - prefix, suffix = tag.split(':', 1) + if tag.find(":") != -1: + prefix, suffix = tag.split(":", 1) else: - prefix, suffix = '', tag + prefix, suffix = "", tag prefix = self.namespacemap.get(prefix, prefix) if prefix: - prefix = prefix + '_' + prefix = prefix + "_" # call special handler (if defined) or default handler - methodname = '_end_' + prefix + suffix + methodname = "_end_" + prefix + suffix try: method = getattr(self, methodname) method() @@ 
-519,12 +839,16 @@ def unknown_endtag(self, tag):
             self.pop(prefix + suffix)

         # track inline content
-        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
+        if (
+            self.incontent
+            and self.contentparams.has_key("type")
+            and not self.contentparams.get("type", "xml").endswith("xml")
+        ):
             # element declared itself as escaped markup, but it isn't really
-            self.contentparams['type'] = 'application/xhtml+xml'
-        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
-            tag = tag.split(':')[-1]
-            self.handle_data('</%s>' % tag, escape=0)
+            self.contentparams["type"] = "application/xhtml+xml"
+        if self.incontent and self.contentparams.get("type") == "application/xhtml+xml":
+            tag = tag.split(":")[-1]
+            self.handle_data(f"</{tag}>", escape=0)

         # track xml:base and xml:lang going out of scope
         if self.basestack:
@@ -533,144 +857,156 @@ def unknown_endtag(self, tag):
             self.baseuri = self.basestack[-1]
         if self.langstack:
             self.langstack.pop()
-            if self.langstack: # and (self.langstack[-1] is not None):
+            if self.langstack:  # and (self.langstack[-1] is not None):
                 self.lang = self.langstack[-1]

-    def handle_charref(self, ref):
+    def handle_charref(self, ref) -> None:
         # called for each character reference, e.g. for '&#160;', ref will be '160'
-        if not self.elementstack: return
+        if not self.elementstack:
+            return
         ref = ref.lower()
-        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
-            text = '&#%s;' % ref
+        if ref in ("34", "38", "39", "60", "62", "x22", "x26", "x27", "x3c", "x3e"):
+            text = f"&#{ref};"
         else:
-            if ref[0] == 'x':
-                c = int(ref[1:], 16)
-            else:
-                c = int(ref)
-            text = chr(c).encode('utf-8')
+            c = int(ref[1:], 16) if ref[0] == "x" else int(ref)
+            text = chr(c).encode("utf-8")
         self.elementstack[-1][2].append(text)

-    def handle_entityref(self, ref):
+    def handle_entityref(self, ref) -> None:
         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
-        if not self.elementstack: return
-        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
-        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
-            text = '&%s;' % ref
+        if not self.elementstack:
+            return
+        if _debug:
+            sys.stderr.write(f"entering handle_entityref with {ref}\n")
+        if ref in ("lt", "gt", "quot", "amp", "apos"):
+            text = f"&{ref};"
         else:
             # entity resolution graciously donated by Aaron Swartz
             def name2cp(k):
                 import htmlentitydefs
-                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
+
+                if hasattr(htmlentitydefs, "name2codepoint"):  # requires Python 2.3
                     return htmlentitydefs.name2codepoint[k]
                 k = htmlentitydefs.entitydefs[k]
-                if k.startswith('&#') and k.endswith(';'):
-                    return int(k[2:-1]) # not in latin-1
+                if k.startswith("&#") and k.endswith(";"):
+                    return int(k[2:-1])  # not in latin-1
                 return ord(k)
-            try: name2cp(ref)
-            except KeyError: text = '&%s;' % ref
-            else: text = chr(name2cp(ref)).encode('utf-8')
+
+            try:
+                name2cp(ref)
+            except KeyError:
+                text = f"&{ref};"
+            else:
+                text = chr(name2cp(ref)).encode("utf-8")
         self.elementstack[-1][2].append(text)

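# --- illustrative sketch (not from the patched source) -----------------------
# What the two callbacks above append to the element stack, assuming a parser
# instance `p` (hypothetical) and the Python 2 htmlentitydefs table:
#
#     p.handle_charref("160")     # appends chr(160).encode("utf-8") == b"\xc2\xa0"
#     p.handle_entityref("copy")  # appends chr(169).encode("utf-8") == b"\xc2\xa9"
#     p.handle_entityref("amp")   # appends "&amp;" -- the five XML built-ins stay escaped
# ------------------------------------------------------------------------------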
-    def handle_data(self, text, escape=1):
+    def handle_data(self, text, escape=1) -> None:
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
-        if not self.elementstack: return
-        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
+        if not self.elementstack:
+            return
+        if escape and self.contentparams.get("type") == "application/xhtml+xml":
             text = _xmlescape(text)
         self.elementstack[-1][2].append(text)

-    def handle_comment(self, text):
+    def handle_comment(self, text) -> None:
         # called for each comment, e.g. <!-- insert message here -->
         pass

-    def handle_pi(self, text):
+    def handle_pi(self, text) -> None:
         # called for each processing instruction, e.g. <?instruction>
         pass

-    def handle_decl(self, text):
+    def handle_decl(self, text) -> None:
         pass

     def parse_declaration(self, i):
         # override internal declaration handler to handle CDATA blocks
-        if _debug: sys.stderr.write('entering parse_declaration\n')
-        if self.rawdata[i:i+9] == '<![CDATA[':
-            k = self.rawdata.find(']]>', i)
-            if k == -1: k = len(self.rawdata)
-            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
-            return k+3
+        if _debug:
+            sys.stderr.write("entering parse_declaration\n")
+        if self.rawdata[i : i + 9] == "<![CDATA[":
+            k = self.rawdata.find("]]>", i)
+            if k == -1:
+                k = len(self.rawdata)
+            self.handle_data(_xmlescape(self.rawdata[i + 9 : k]), 0)
+            return k + 3
         else:
-            k = self.rawdata.find('>', i)
-            return k+1
+            k = self.rawdata.find(">", i)
+            return k + 1

     def mapContentType(self, contentType):
         contentType = contentType.lower()
-        if contentType == 'text':
-            contentType = 'text/plain'
-        elif contentType == 'html':
-            contentType = 'text/html'
-        elif contentType == 'xhtml':
-            contentType = 'application/xhtml+xml'
+        if contentType == "text":
+            contentType = "text/plain"
+        elif contentType == "html":
+            contentType = "text/html"
+        elif contentType == "xhtml":
+            contentType = "application/xhtml+xml"
         return contentType

-    def trackNamespace(self, prefix, uri):
+    def trackNamespace(self, prefix, uri) -> None:
         loweruri = uri.lower()
-        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
-            self.version = 'rss090'
-        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
-            self.version = 'rss10'
-        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
-            self.version = 'atom10'
-        if loweruri.find('backend.userland.com/rss') != -1:
+        if (prefix, loweruri) == (None, "http://my.netscape.com/rdf/simple/0.9/") and not self.version:
+            self.version = "rss090"
+        if loweruri == "http://purl.org/rss/1.0/" and not self.version:
+            self.version = "rss10"
+        if loweruri == "http://www.w3.org/2005/atom" and not self.version:
+            self.version = "atom10"
+        if loweruri.find("backend.userland.com/rss") != -1:
             # match any backend.userland.com namespace
-            uri = 'http://backend.userland.com/rss'
+            uri = "http://backend.userland.com/rss"
             loweruri = uri
         if self._matchnamespaces.has_key(loweruri):
             self.namespacemap[prefix] = self._matchnamespaces[loweruri]
             self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
         else:
-            self.namespacesInUse[prefix or ''] = uri
+            self.namespacesInUse[prefix or ""] = uri

     def resolveURI(self, uri):
-        return _urljoin(self.baseuri or '', uri)
+        return _urljoin(self.baseuri or "", uri)

     def decodeEntities(self, element, data):
         return data

     def strattrs(self, attrs):
-        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
+        return "".join([' {}="{}"'.format(t[0], _xmlescape(t[1], {'"': "&quot;"})) for t in attrs])

-    def push(self, element, expectingText):
+    def push(self, element, expectingText) -> None:
         self.elementstack.append([element, expectingText, []])

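# --- illustrative sketch (not from the patched source) -----------------------
# Each elementstack entry made by push() is [element, expectingText, pieces];
# handle_data() appends to pieces and pop() below joins them. A hypothetical
# trace for <title>Hello</title>:
#
#     self.push("title", 1)       # elementstack[-1] == ["title", 1, []]
#     self.handle_data("Hello")   # elementstack[-1] == ["title", 1, ["Hello"]]
#     self.pop("title")           # joins the pieces and returns "Hello"
# ------------------------------------------------------------------------------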
     def pop(self, element, stripWhitespace=1):
-        if not self.elementstack: return
-        if self.elementstack[-1][0] != element: return
+        if not self.elementstack:
+            return None
+        if self.elementstack[-1][0] != element:
+            return None
         element, expectingText, pieces = self.elementstack.pop()
-        if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
+        if self.version == "atom10" and self.contentparams.get("type", "text") == "application/xhtml+xml":
             # remove enclosing child element, but only if it is a <div> and
             # only if all the remaining content is nested underneath it.
             # This means that the divs would be retained in the following:
             #    <div>foo</div><div>bar</div>
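# --- illustrative sketch (not from the patched source) -----------------------
# Concretely: pieces == ["<div>", "foo", "</div>"] is unwrapped to ["foo"]
# because the loop below runs to completion and its else-clause fires, while
# ["<div>", "foo", "</div>", "<div>", "bar", "</div>"] breaks out early (the
# first </div> closes at depth 0), so both divs are retained.
# ------------------------------------------------------------------------------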
-            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and (pieces[-1]=='</div>'):
+            if pieces and (pieces[0] == "<div>" or pieces[0].startswith("<div ")) and (pieces[-1] == "</div>"):
                 depth = 0
                 for piece in pieces[:-1]:
-                    if piece.startswith('</'):
+                    if piece.startswith("</"):
                         depth -= 1
-                        if depth == 0: break
-                    elif piece.startswith('<') and not piece.endswith('/>'):
+                        if depth == 0:
+                            break
+                    elif piece.startswith("<") and not piece.endswith("/>"):
                         depth += 1
                 else:
                     pieces = pieces[1:-1]
-        output = ''.join(pieces)
+        output = "".join(pieces)
         if stripWhitespace:
             output = output.strip()
-        if not expectingText: return output
+        if not expectingText:
+            return output

         # decode base64 content
-        if base64 and self.contentparams.get('base64', 0):
+        if base64 and self.contentparams.get("base64", 0):
             try:
                 output = base64.decodestring(output)
             except binascii.Error:
@@ -683,90 +1019,85 @@ def pop(self, element, stripWhitespace=1):
             output = self.resolveURI(output)

         # decode entities within embedded markup
-        if not self.contentparams.get('base64', 0):
+        if not self.contentparams.get("base64", 0):
             output = self.decodeEntities(element, output)

         # remove temporary cruft from contentparams
-        try:
-            del self.contentparams['mode']
-        except KeyError:
-            pass
-        try:
-            del self.contentparams['base64']
-        except KeyError:
-            pass
+        with contextlib.suppress(KeyError):
+            del self.contentparams["mode"]
+        with contextlib.suppress(KeyError):
+            del self.contentparams["base64"]

         # resolve relative URIs within embedded markup
-        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
+        if self.mapContentType(self.contentparams.get("type", "text/html")) in self.html_types:
             if element in self.can_contain_relative_uris:
                 output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

         # sanitize embedded markup
-        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
+        if self.mapContentType(self.contentparams.get("type", "text/html")) in self.html_types:
             if element in self.can_contain_dangerous_markup:
                 output = _sanitizeHTML(output, self.encoding)

-        if self.encoding and type(output) != type(''):
-            try:
+        if self.encoding and type(output) != str:
+            with contextlib.suppress(Exception):
                 output = str(output, self.encoding)
-            except:
-                pass

         # address common error where people take data that is already
         # utf-8, presume that it is iso-8859-1, and re-encode it. 
- if self.encoding=='utf-8' and type(output) == type(''): - try: - output = str(output.encode('iso-8859-1'), 'utf-8') - except: - pass + if self.encoding == "utf-8" and type(output) == str: + with contextlib.suppress(Exception): + output = str(output.encode("iso-8859-1"), "utf-8") # map win-1252 extensions to the proper code points - if type(output) == type(''): - output = ''.join([c in cp1252 and cp1252[c] or c for c in output]) + if type(output) == str: + output = "".join([c in cp1252 and cp1252[c] or c for c in output]) # categories/tags/keywords/whatever are handled in _end_category - if element == 'category': + if element == "category": return output # store output in appropriate place(s) if self.inentry and not self.insource: - if element == 'content': + if element == "content": self.entries[-1].setdefault(element, []) contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output + contentparams["value"] = output self.entries[-1][element].append(contentparams) - elif element == 'link': + elif element == "link": self.entries[-1][element] = output if output: - self.entries[-1]['links'][-1]['href'] = output + self.entries[-1]["links"][-1]["href"] = output else: - if element == 'description': - element = 'summary' + if element == "description": + element = "summary" self.entries[-1][element] = output if self.incontent: contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output - self.entries[-1][element + '_detail'] = contentparams + contentparams["value"] = output + self.entries[-1][element + "_detail"] = contentparams elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage): context = self._getContext() - if element == 'description': - element = 'subtitle' + if element == "description": + element = "subtitle" context[element] = output - if element == 'link': - context['links'][-1]['href'] = output + if element == "link": + context["links"][-1]["href"] = output elif self.incontent: contentparams = copy.deepcopy(self.contentparams) - contentparams['value'] = output - context[element + '_detail'] = contentparams + contentparams["value"] = output + context[element + "_detail"] = contentparams return output - def pushContent(self, tag, attrsD, defaultContentType, expectingText): + def pushContent(self, tag, attrsD, defaultContentType, expectingText) -> None: self.incontent += 1 - self.contentparams = FeedParserDict({ - 'type': self.mapContentType(attrsD.get('type', defaultContentType)), - 'language': self.lang, - 'base': self.baseuri}) - self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams) + self.contentparams = FeedParserDict( + { + "type": self.mapContentType(attrsD.get("type", defaultContentType)), + "language": self.lang, + "base": self.baseuri, + } + ) + self.contentparams["base64"] = self._isBase64(attrsD, self.contentparams) self.push(tag, expectingText) def popContent(self, tag): @@ -776,241 +1107,244 @@ def popContent(self, tag): return value def _mapToStandardPrefix(self, name): - colonpos = name.find(':') + colonpos = name.find(":") if colonpos != -1: prefix = name[:colonpos] - suffix = name[colonpos+1:] + suffix = name[colonpos + 1 :] prefix = self.namespacemap.get(prefix, prefix) - name = prefix + ':' + suffix + name = prefix + ":" + suffix return name def _getAttribute(self, attrsD, name): return attrsD.get(self._mapToStandardPrefix(name)) - def _isBase64(self, attrsD, contentparams): - if attrsD.get('mode', '') == 'base64': + def _isBase64(self, attrsD, contentparams) -> int: + if 
attrsD.get("mode", "") == "base64": return 1 - if self.contentparams['type'].startswith('text/'): + if self.contentparams["type"].startswith("text/"): return 0 - if self.contentparams['type'].endswith('+xml'): + if self.contentparams["type"].endswith("+xml"): return 0 - if self.contentparams['type'].endswith('/xml'): + if self.contentparams["type"].endswith("/xml"): return 0 return 1 def _itsAnHrefDamnIt(self, attrsD): - href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None))) + href = attrsD.get("url", attrsD.get("uri", attrsD.get("href", None))) if href: - try: - del attrsD['url'] - except KeyError: - pass - try: - del attrsD['uri'] - except KeyError: - pass - attrsD['href'] = href + with contextlib.suppress(KeyError): + del attrsD["url"] + with contextlib.suppress(KeyError): + del attrsD["uri"] + attrsD["href"] = href return attrsD - def _save(self, key, value): + def _save(self, key, value) -> None: context = self._getContext() context.setdefault(key, value) - def _start_rss(self, attrsD): - versionmap = {'0.91': 'rss091u', - '0.92': 'rss092', - '0.93': 'rss093', - '0.94': 'rss094'} + def _start_rss(self, attrsD) -> None: + versionmap = {"0.91": "rss091u", "0.92": "rss092", "0.93": "rss093", "0.94": "rss094"} if not self.version: - attr_version = attrsD.get('version', '') + attr_version = attrsD.get("version", "") version = versionmap.get(attr_version) if version: self.version = version - elif attr_version.startswith('2.'): - self.version = 'rss20' + elif attr_version.startswith("2."): + self.version = "rss20" else: - self.version = 'rss' + self.version = "rss" - def _start_dlhottitles(self, attrsD): - self.version = 'hotrss' + def _start_dlhottitles(self, attrsD) -> None: + self.version = "hotrss" - def _start_channel(self, attrsD): + def _start_channel(self, attrsD) -> None: self.infeed = 1 self._cdf_common(attrsD) + _start_feedinfo = _start_channel - def _cdf_common(self, attrsD): - if attrsD.has_key('lastmod'): + def _cdf_common(self, attrsD) -> None: + if attrsD.has_key("lastmod"): self._start_modified({}) - self.elementstack[-1][-1] = attrsD['lastmod'] + self.elementstack[-1][-1] = attrsD["lastmod"] self._end_modified() - if attrsD.has_key('href'): + if attrsD.has_key("href"): self._start_link({}) - self.elementstack[-1][-1] = attrsD['href'] + self.elementstack[-1][-1] = attrsD["href"] self._end_link() - def _start_feed(self, attrsD): + def _start_feed(self, attrsD) -> None: self.infeed = 1 - versionmap = {'0.1': 'atom01', - '0.2': 'atom02', - '0.3': 'atom03'} + versionmap = {"0.1": "atom01", "0.2": "atom02", "0.3": "atom03"} if not self.version: - attr_version = attrsD.get('version') + attr_version = attrsD.get("version") version = versionmap.get(attr_version) if version: self.version = version else: - self.version = 'atom' + self.version = "atom" - def _end_channel(self): + def _end_channel(self) -> None: self.infeed = 0 + _end_feed = _end_channel - def _start_image(self, attrsD): + def _start_image(self, attrsD) -> None: self.inimage = 1 - self.push('image', 0) + self.push("image", 0) context = self._getContext() - context.setdefault('image', FeedParserDict()) + context.setdefault("image", FeedParserDict()) - def _end_image(self): - self.pop('image') + def _end_image(self) -> None: + self.pop("image") self.inimage = 0 - def _start_textinput(self, attrsD): + def _start_textinput(self, attrsD) -> None: self.intextinput = 1 - self.push('textinput', 0) + self.push("textinput", 0) context = self._getContext() - context.setdefault('textinput', FeedParserDict()) + 
context.setdefault("textinput", FeedParserDict()) + _start_textInput = _start_textinput - def _end_textinput(self): - self.pop('textinput') + def _end_textinput(self) -> None: + self.pop("textinput") self.intextinput = 0 + _end_textInput = _end_textinput - def _start_author(self, attrsD): + def _start_author(self, attrsD) -> None: self.inauthor = 1 - self.push('author', 1) + self.push("author", 1) + _start_managingeditor = _start_author _start_dc_author = _start_author _start_dc_creator = _start_author _start_itunes_author = _start_author - def _end_author(self): - self.pop('author') + def _end_author(self) -> None: + self.pop("author") self.inauthor = 0 self._sync_author_detail() + _end_managingeditor = _end_author _end_dc_author = _end_author _end_dc_creator = _end_author _end_itunes_author = _end_author - def _start_itunes_owner(self, attrsD): + def _start_itunes_owner(self, attrsD) -> None: self.inpublisher = 1 - self.push('publisher', 0) + self.push("publisher", 0) - def _end_itunes_owner(self): - self.pop('publisher') + def _end_itunes_owner(self) -> None: + self.pop("publisher") self.inpublisher = 0 - self._sync_author_detail('publisher') + self._sync_author_detail("publisher") - def _start_contributor(self, attrsD): + def _start_contributor(self, attrsD) -> None: self.incontributor = 1 context = self._getContext() - context.setdefault('contributors', []) - context['contributors'].append(FeedParserDict()) - self.push('contributor', 0) + context.setdefault("contributors", []) + context["contributors"].append(FeedParserDict()) + self.push("contributor", 0) - def _end_contributor(self): - self.pop('contributor') + def _end_contributor(self) -> None: + self.pop("contributor") self.incontributor = 0 - def _start_dc_contributor(self, attrsD): + def _start_dc_contributor(self, attrsD) -> None: self.incontributor = 1 context = self._getContext() - context.setdefault('contributors', []) - context['contributors'].append(FeedParserDict()) - self.push('name', 0) + context.setdefault("contributors", []) + context["contributors"].append(FeedParserDict()) + self.push("name", 0) - def _end_dc_contributor(self): + def _end_dc_contributor(self) -> None: self._end_name() self.incontributor = 0 - def _start_name(self, attrsD): - self.push('name', 0) + def _start_name(self, attrsD) -> None: + self.push("name", 0) + _start_itunes_name = _start_name - def _end_name(self): - value = self.pop('name') + def _end_name(self) -> None: + value = self.pop("name") if self.inpublisher: - self._save_author('name', value, 'publisher') + self._save_author("name", value, "publisher") elif self.inauthor: - self._save_author('name', value) + self._save_author("name", value) elif self.incontributor: - self._save_contributor('name', value) + self._save_contributor("name", value) elif self.intextinput: context = self._getContext() - context['textinput']['name'] = value + context["textinput"]["name"] = value + _end_itunes_name = _end_name - def _start_width(self, attrsD): - self.push('width', 0) + def _start_width(self, attrsD) -> None: + self.push("width", 0) - def _end_width(self): - value = self.pop('width') + def _end_width(self) -> None: + value = self.pop("width") try: value = int(value) except: value = 0 if self.inimage: context = self._getContext() - context['image']['width'] = value + context["image"]["width"] = value - def _start_height(self, attrsD): - self.push('height', 0) + def _start_height(self, attrsD) -> None: + self.push("height", 0) - def _end_height(self): - value = self.pop('height') + def 
_end_height(self) -> None: + value = self.pop("height") try: value = int(value) except: value = 0 if self.inimage: context = self._getContext() - context['image']['height'] = value + context["image"]["height"] = value + + def _start_url(self, attrsD) -> None: + self.push("href", 1) - def _start_url(self, attrsD): - self.push('href', 1) _start_homepage = _start_url _start_uri = _start_url - def _end_url(self): - value = self.pop('href') + def _end_url(self) -> None: + value = self.pop("href") if self.inauthor: - self._save_author('href', value) + self._save_author("href", value) elif self.incontributor: - self._save_contributor('href', value) + self._save_contributor("href", value) elif self.inimage: context = self._getContext() - context['image']['href'] = value + context["image"]["href"] = value elif self.intextinput: context = self._getContext() - context['textinput']['link'] = value + context["textinput"]["link"] = value + _end_homepage = _end_url _end_uri = _end_url - def _start_email(self, attrsD): - self.push('email', 0) + def _start_email(self, attrsD) -> None: + self.push("email", 0) + _start_itunes_email = _start_email - def _end_email(self): - value = self.pop('email') + def _end_email(self) -> None: + value = self.pop("email") if self.inpublisher: - self._save_author('email', value, 'publisher') + self._save_author("email", value, "publisher") elif self.inauthor: - self._save_author('email', value) + self._save_author("email", value) elif self.incontributor: - self._save_contributor('email', value) + self._save_contributor("email", value) + _end_itunes_email = _end_email def _getContext(self): @@ -1022,430 +1356,479 @@ def _getContext(self): context = self.feeddata return context - def _save_author(self, key, value, prefix='author'): + def _save_author(self, key, value, prefix="author") -> None: context = self._getContext() - context.setdefault(prefix + '_detail', FeedParserDict()) - context[prefix + '_detail'][key] = value + context.setdefault(prefix + "_detail", FeedParserDict()) + context[prefix + "_detail"][key] = value self._sync_author_detail() - def _save_contributor(self, key, value): + def _save_contributor(self, key, value) -> None: context = self._getContext() - context.setdefault('contributors', [FeedParserDict()]) - context['contributors'][-1][key] = value + context.setdefault("contributors", [FeedParserDict()]) + context["contributors"][-1][key] = value - def _sync_author_detail(self, key='author'): + def _sync_author_detail(self, key="author") -> None: context = self._getContext() - detail = context.get('%s_detail' % key) + detail = context.get(f"{key}_detail") if detail: - name = detail.get('name') - email = detail.get('email') + name = detail.get("name") + email = detail.get("email") if name and email: - context[key] = '%s (%s)' % (name, email) + context[key] = f"{name} ({email})" elif name: context[key] = name elif email: context[key] = email else: author = context.get(key) - if not author: return - emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author) - if not emailmatch: return + if not author: + return + emailmatch = re.search( + r"""(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))""", + author, + ) + if not emailmatch: + return email = emailmatch.group(0) # probably a better way to do the following, but it passes all the tests - author = author.replace(email, '') - author = 
author.replace('()', '') + author = author.replace(email, "") + author = author.replace("()", "") author = author.strip() - if author and (author[0] == '('): + if author and (author[0] == "("): author = author[1:] - if author and (author[-1] == ')'): + if author and (author[-1] == ")"): author = author[:-1] author = author.strip() - context.setdefault('%s_detail' % key, FeedParserDict()) - context['%s_detail' % key]['name'] = author - context['%s_detail' % key]['email'] = email + context.setdefault(f"{key}_detail", FeedParserDict()) + context[f"{key}_detail"]["name"] = author + context[f"{key}_detail"]["email"] = email + + def _start_subtitle(self, attrsD) -> None: + self.pushContent("subtitle", attrsD, "text/plain", 1) - def _start_subtitle(self, attrsD): - self.pushContent('subtitle', attrsD, 'text/plain', 1) _start_tagline = _start_subtitle _start_itunes_subtitle = _start_subtitle - def _end_subtitle(self): - self.popContent('subtitle') + def _end_subtitle(self) -> None: + self.popContent("subtitle") + _end_tagline = _end_subtitle _end_itunes_subtitle = _end_subtitle - def _start_rights(self, attrsD): - self.pushContent('rights', attrsD, 'text/plain', 1) + def _start_rights(self, attrsD) -> None: + self.pushContent("rights", attrsD, "text/plain", 1) + _start_dc_rights = _start_rights _start_copyright = _start_rights - def _end_rights(self): - self.popContent('rights') + def _end_rights(self) -> None: + self.popContent("rights") + _end_dc_rights = _end_rights _end_copyright = _end_rights - def _start_item(self, attrsD): + def _start_item(self, attrsD) -> None: self.entries.append(FeedParserDict()) - self.push('item', 0) + self.push("item", 0) self.inentry = 1 self.guidislink = 0 - id = self._getAttribute(attrsD, 'rdf:about') + id = self._getAttribute(attrsD, "rdf:about") if id: context = self._getContext() - context['id'] = id + context["id"] = id self._cdf_common(attrsD) + _start_entry = _start_item _start_product = _start_item - def _end_item(self): - self.pop('item') + def _end_item(self) -> None: + self.pop("item") self.inentry = 0 + _end_entry = _end_item - def _start_dc_language(self, attrsD): - self.push('language', 1) + def _start_dc_language(self, attrsD) -> None: + self.push("language", 1) + _start_language = _start_dc_language - def _end_dc_language(self): - self.lang = self.pop('language') + def _end_dc_language(self) -> None: + self.lang = self.pop("language") + _end_language = _end_dc_language - def _start_dc_publisher(self, attrsD): - self.push('publisher', 1) + def _start_dc_publisher(self, attrsD) -> None: + self.push("publisher", 1) + _start_webmaster = _start_dc_publisher - def _end_dc_publisher(self): - self.pop('publisher') - self._sync_author_detail('publisher') + def _end_dc_publisher(self) -> None: + self.pop("publisher") + self._sync_author_detail("publisher") + _end_webmaster = _end_dc_publisher - def _start_published(self, attrsD): - self.push('published', 1) + def _start_published(self, attrsD) -> None: + self.push("published", 1) + _start_dcterms_issued = _start_published _start_issued = _start_published - def _end_published(self): - value = self.pop('published') - self._save('published_parsed', _parse_date(value)) + def _end_published(self) -> None: + value = self.pop("published") + self._save("published_parsed", _parse_date(value)) + _end_dcterms_issued = _end_published _end_issued = _end_published - def _start_updated(self, attrsD): - self.push('updated', 1) + def _start_updated(self, attrsD) -> None: + self.push("updated", 1) + _start_modified = 
_start_updated _start_dcterms_modified = _start_updated _start_pubdate = _start_updated _start_dc_date = _start_updated - def _end_updated(self): - value = self.pop('updated') + def _end_updated(self) -> None: + value = self.pop("updated") parsed_value = _parse_date(value) - self._save('updated_parsed', parsed_value) + self._save("updated_parsed", parsed_value) + _end_modified = _end_updated _end_dcterms_modified = _end_updated _end_pubdate = _end_updated _end_dc_date = _end_updated - def _start_created(self, attrsD): - self.push('created', 1) + def _start_created(self, attrsD) -> None: + self.push("created", 1) + _start_dcterms_created = _start_created - def _end_created(self): - value = self.pop('created') - self._save('created_parsed', _parse_date(value)) + def _end_created(self) -> None: + value = self.pop("created") + self._save("created_parsed", _parse_date(value)) + _end_dcterms_created = _end_created - def _start_expirationdate(self, attrsD): - self.push('expired', 1) + def _start_expirationdate(self, attrsD) -> None: + self.push("expired", 1) - def _end_expirationdate(self): - self._save('expired_parsed', _parse_date(self.pop('expired'))) + def _end_expirationdate(self) -> None: + self._save("expired_parsed", _parse_date(self.pop("expired"))) - def _start_cc_license(self, attrsD): - self.push('license', 1) - value = self._getAttribute(attrsD, 'rdf:resource') + def _start_cc_license(self, attrsD) -> None: + self.push("license", 1) + value = self._getAttribute(attrsD, "rdf:resource") if value: self.elementstack[-1][2].append(value) - self.pop('license') + self.pop("license") - def _start_creativecommons_license(self, attrsD): - self.push('license', 1) + def _start_creativecommons_license(self, attrsD) -> None: + self.push("license", 1) - def _end_creativecommons_license(self): - self.pop('license') + def _end_creativecommons_license(self) -> None: + self.pop("license") - def _addTag(self, term, scheme, label): + def _addTag(self, term, scheme, label) -> None: context = self._getContext() - tags = context.setdefault('tags', []) - if (not term) and (not scheme) and (not label): return - value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) + tags = context.setdefault("tags", []) + if (not term) and (not scheme) and (not label): + return + value = FeedParserDict({"term": term, "scheme": scheme, "label": label}) if value not in tags: - tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label})) + tags.append(FeedParserDict({"term": term, "scheme": scheme, "label": label})) - def _start_category(self, attrsD): - if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) - term = attrsD.get('term') - scheme = attrsD.get('scheme', attrsD.get('domain')) - label = attrsD.get('label') + def _start_category(self, attrsD) -> None: + if _debug: + sys.stderr.write(f"entering _start_category with {attrsD!r}\n") + term = attrsD.get("term") + scheme = attrsD.get("scheme", attrsD.get("domain")) + label = attrsD.get("label") self._addTag(term, scheme, label) - self.push('category', 1) + self.push("category", 1) + _start_dc_subject = _start_category _start_keywords = _start_category - def _end_itunes_keywords(self): - for term in self.pop('itunes_keywords').split(): - self._addTag(term, 'http://www.itunes.com/', None) + def _end_itunes_keywords(self) -> None: + for term in self.pop("itunes_keywords").split(): + self._addTag(term, "http://www.itunes.com/", None) - def _start_itunes_category(self, attrsD): - self._addTag(attrsD.get('text'), 
'http://www.itunes.com/', None) - self.push('category', 1) + def _start_itunes_category(self, attrsD) -> None: + self._addTag(attrsD.get("text"), "http://www.itunes.com/", None) + self.push("category", 1) - def _end_category(self): - value = self.pop('category') - if not value: return + def _end_category(self) -> None: + value = self.pop("category") + if not value: + return context = self._getContext() - tags = context['tags'] - if value and len(tags) and not tags[-1]['term']: - tags[-1]['term'] = value + tags = context["tags"] + if value and len(tags) and not tags[-1]["term"]: + tags[-1]["term"] = value else: self._addTag(value, None, None) + _end_dc_subject = _end_category _end_keywords = _end_category _end_itunes_category = _end_category - def _start_cloud(self, attrsD): - self._getContext()['cloud'] = FeedParserDict(attrsD) + def _start_cloud(self, attrsD) -> None: + self._getContext()["cloud"] = FeedParserDict(attrsD) - def _start_link(self, attrsD): - attrsD.setdefault('rel', 'alternate') - attrsD.setdefault('type', 'text/html') + def _start_link(self, attrsD) -> None: + attrsD.setdefault("rel", "alternate") + attrsD.setdefault("type", "text/html") attrsD = self._itsAnHrefDamnIt(attrsD) - if attrsD.has_key('href'): - attrsD['href'] = self.resolveURI(attrsD['href']) + if attrsD.has_key("href"): + attrsD["href"] = self.resolveURI(attrsD["href"]) expectingText = self.infeed or self.inentry or self.insource context = self._getContext() - context.setdefault('links', []) - context['links'].append(FeedParserDict(attrsD)) - if attrsD['rel'] == 'enclosure': + context.setdefault("links", []) + context["links"].append(FeedParserDict(attrsD)) + if attrsD["rel"] == "enclosure": self._start_enclosure(attrsD) - if attrsD.has_key('href'): + if attrsD.has_key("href"): expectingText = 0 - if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): - context['link'] = attrsD['href'] + if (attrsD.get("rel") == "alternate") and (self.mapContentType(attrsD.get("type")) in self.html_types): + context["link"] = attrsD["href"] else: - self.push('link', expectingText) + self.push("link", expectingText) + _start_producturl = _start_link - def _end_link(self): - value = self.pop('link') + def _end_link(self) -> None: + value = self.pop("link") context = self._getContext() if self.intextinput: - context['textinput']['link'] = value + context["textinput"]["link"] = value if self.inimage: - context['image']['link'] = value + context["image"]["link"] = value + _end_producturl = _end_link - def _start_guid(self, attrsD): - self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') - self.push('id', 1) + def _start_guid(self, attrsD) -> None: + self.guidislink = attrsD.get("ispermalink", "true") == "true" + self.push("id", 1) - def _end_guid(self): - value = self.pop('id') - self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) + def _end_guid(self) -> None: + value = self.pop("id") + self._save("guidislink", self.guidislink and not self._getContext().has_key("link")) if self.guidislink: # guid acts as link, but only if 'ispermalink' is not present or is 'true', # and only if the item doesn't already have a link element - self._save('link', value) + self._save("link", value) + + def _start_title(self, attrsD) -> None: + self.pushContent("title", attrsD, "text/plain", self.infeed or self.inentry or self.insource) - def _start_title(self, attrsD): - self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or 
self.insource) - def _start_title_low_pri(self, attrsD): - if not self._getContext().has_key('title'): + def _start_title_low_pri(self, attrsD) -> None: + if not self._getContext().has_key("title"): self._start_title(attrsD) + _start_dc_title = _start_title_low_pri _start_media_title = _start_title_low_pri - def _end_title(self): - value = self.popContent('title') + def _end_title(self) -> None: + value = self.popContent("title") context = self._getContext() if self.intextinput: - context['textinput']['title'] = value + context["textinput"]["title"] = value elif self.inimage: - context['image']['title'] = value - def _end_title_low_pri(self): - if not self._getContext().has_key('title'): + context["image"]["title"] = value + + def _end_title_low_pri(self) -> None: + if not self._getContext().has_key("title"): self._end_title() + _end_dc_title = _end_title_low_pri _end_media_title = _end_title_low_pri - def _start_description(self, attrsD): + def _start_description(self, attrsD) -> None: context = self._getContext() - if context.has_key('summary'): - self._summaryKey = 'content' + if context.has_key("summary"): + self._summaryKey = "content" self._start_content(attrsD) else: - self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) + self.pushContent("description", attrsD, "text/html", self.infeed or self.inentry or self.insource) - def _start_abstract(self, attrsD): - self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) + def _start_abstract(self, attrsD) -> None: + self.pushContent("description", attrsD, "text/plain", self.infeed or self.inentry or self.insource) - def _end_description(self): - if self._summaryKey == 'content': + def _end_description(self) -> None: + if self._summaryKey == "content": self._end_content() else: - value = self.popContent('description') + value = self.popContent("description") context = self._getContext() if self.intextinput: - context['textinput']['description'] = value + context["textinput"]["description"] = value elif self.inimage: - context['image']['description'] = value + context["image"]["description"] = value self._summaryKey = None + _end_abstract = _end_description - def _start_info(self, attrsD): - self.pushContent('info', attrsD, 'text/plain', 1) + def _start_info(self, attrsD) -> None: + self.pushContent("info", attrsD, "text/plain", 1) + _start_feedburner_browserfriendly = _start_info - def _end_info(self): - self.popContent('info') + def _end_info(self) -> None: + self.popContent("info") + _end_feedburner_browserfriendly = _end_info - def _start_generator(self, attrsD): + def _start_generator(self, attrsD) -> None: if attrsD: attrsD = self._itsAnHrefDamnIt(attrsD) - if attrsD.has_key('href'): - attrsD['href'] = self.resolveURI(attrsD['href']) - self._getContext()['generator_detail'] = FeedParserDict(attrsD) - self.push('generator', 1) + if attrsD.has_key("href"): + attrsD["href"] = self.resolveURI(attrsD["href"]) + self._getContext()["generator_detail"] = FeedParserDict(attrsD) + self.push("generator", 1) - def _end_generator(self): - value = self.pop('generator') + def _end_generator(self) -> None: + value = self.pop("generator") context = self._getContext() - if context.has_key('generator_detail'): - context['generator_detail']['name'] = value + if context.has_key("generator_detail"): + context["generator_detail"]["name"] = value - def _start_admin_generatoragent(self, attrsD): - self.push('generator', 1) - value = self._getAttribute(attrsD, 
'rdf:resource') + def _start_admin_generatoragent(self, attrsD) -> None: + self.push("generator", 1) + value = self._getAttribute(attrsD, "rdf:resource") if value: self.elementstack[-1][2].append(value) - self.pop('generator') - self._getContext()['generator_detail'] = FeedParserDict({'href': value}) + self.pop("generator") + self._getContext()["generator_detail"] = FeedParserDict({"href": value}) - def _start_admin_errorreportsto(self, attrsD): - self.push('errorreportsto', 1) - value = self._getAttribute(attrsD, 'rdf:resource') + def _start_admin_errorreportsto(self, attrsD) -> None: + self.push("errorreportsto", 1) + value = self._getAttribute(attrsD, "rdf:resource") if value: self.elementstack[-1][2].append(value) - self.pop('errorreportsto') + self.pop("errorreportsto") - def _start_summary(self, attrsD): + def _start_summary(self, attrsD) -> None: context = self._getContext() - if context.has_key('summary'): - self._summaryKey = 'content' + if context.has_key("summary"): + self._summaryKey = "content" self._start_content(attrsD) else: - self._summaryKey = 'summary' - self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) + self._summaryKey = "summary" + self.pushContent(self._summaryKey, attrsD, "text/plain", 1) + _start_itunes_summary = _start_summary - def _end_summary(self): - if self._summaryKey == 'content': + def _end_summary(self) -> None: + if self._summaryKey == "content": self._end_content() else: - self.popContent(self._summaryKey or 'summary') + self.popContent(self._summaryKey or "summary") self._summaryKey = None + _end_itunes_summary = _end_summary - def _start_enclosure(self, attrsD): + def _start_enclosure(self, attrsD) -> None: attrsD = self._itsAnHrefDamnIt(attrsD) - self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD)) - href = attrsD.get('href') + self._getContext().setdefault("enclosures", []).append(FeedParserDict(attrsD)) + href = attrsD.get("href") if href: context = self._getContext() - if not context.get('id'): - context['id'] = href + if not context.get("id"): + context["id"] = href - def _start_source(self, attrsD): + def _start_source(self, attrsD) -> None: self.insource = 1 - def _end_source(self): + def _end_source(self) -> None: self.insource = 0 - self._getContext()['source'] = copy.deepcopy(self.sourcedata) + self._getContext()["source"] = copy.deepcopy(self.sourcedata) self.sourcedata.clear() - def _start_content(self, attrsD): - self.pushContent('content', attrsD, 'text/plain', 1) - src = attrsD.get('src') + def _start_content(self, attrsD) -> None: + self.pushContent("content", attrsD, "text/plain", 1) + src = attrsD.get("src") if src: - self.contentparams['src'] = src - self.push('content', 1) + self.contentparams["src"] = src + self.push("content", 1) + + def _start_prodlink(self, attrsD) -> None: + self.pushContent("content", attrsD, "text/html", 1) - def _start_prodlink(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) + def _start_body(self, attrsD) -> None: + self.pushContent("content", attrsD, "application/xhtml+xml", 1) - def _start_body(self, attrsD): - self.pushContent('content', attrsD, 'application/xhtml+xml', 1) _start_xhtml_body = _start_body - def _start_content_encoded(self, attrsD): - self.pushContent('content', attrsD, 'text/html', 1) + def _start_content_encoded(self, attrsD) -> None: + self.pushContent("content", attrsD, "text/html", 1) + _start_fullitem = _start_content_encoded - def _end_content(self): - copyToDescription = self.mapContentType(self.contentparams.get('type')) 
in (['text/plain'] + self.html_types) - value = self.popContent('content') + def _end_content(self) -> None: + copyToDescription = self.mapContentType(self.contentparams.get("type")) in (["text/plain", *self.html_types]) + value = self.popContent("content") if copyToDescription: - self._save('description', value) + self._save("description", value) + _end_body = _end_content _end_xhtml_body = _end_content _end_content_encoded = _end_content _end_fullitem = _end_content _end_prodlink = _end_content - def _start_itunes_image(self, attrsD): - self.push('itunes_image', 0) - self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')}) + def _start_itunes_image(self, attrsD) -> None: + self.push("itunes_image", 0) + self._getContext()["image"] = FeedParserDict({"href": attrsD.get("href")}) + _start_itunes_link = _start_itunes_image - def _end_itunes_block(self): - value = self.pop('itunes_block', 0) - self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 + def _end_itunes_block(self) -> None: + value = self.pop("itunes_block", 0) + self._getContext()["itunes_block"] = (value == "yes") and 1 or 0 + + def _end_itunes_explicit(self) -> None: + value = self.pop("itunes_explicit", 0) + self._getContext()["itunes_explicit"] = (value == "yes") and 1 or 0 - def _end_itunes_explicit(self): - value = self.pop('itunes_explicit', 0) - self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 if _XML_AVAILABLE: + class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): - def __init__(self, baseuri, baselang, encoding): - if _debug: sys.stderr.write('trying StrictFeedParser\n') + def __init__(self, baseuri, baselang, encoding) -> None: + if _debug: + sys.stderr.write("trying StrictFeedParser\n") xml.sax.handler.ContentHandler.__init__(self) _FeedParserMixin.__init__(self, baseuri, baselang, encoding) self.bozo = 0 self.exc = None - def startPrefixMapping(self, prefix, uri): + def startPrefixMapping(self, prefix, uri) -> None: self.trackNamespace(prefix, uri) - def startElementNS(self, name, qname, attrs): + def startElementNS(self, name, qname, attrs) -> None: namespace, localname = name - lowernamespace = str(namespace or '').lower() - if lowernamespace.find('backend.userland.com/rss') != -1: + lowernamespace = str(namespace or "").lower() + if lowernamespace.find("backend.userland.com/rss") != -1: # match any backend.userland.com namespace - namespace = 'http://backend.userland.com/rss' + namespace = "http://backend.userland.com/rss" lowernamespace = namespace - if qname and qname.find(':') > 0: - givenprefix = qname.split(':')[0] - else: - givenprefix = None + givenprefix = qname.split(":")[0] if qname and qname.find(":") > 0 else None prefix = self._matchnamespaces.get(lowernamespace, givenprefix) - if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): - raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix) + if ( + givenprefix + and (prefix is None or (prefix == "" and lowernamespace == "")) + and not self.namespacesInUse.has_key(givenprefix) + ): + msg = f"'{givenprefix}' is not associated with a namespace" + raise UndeclaredNamespace(msg) if prefix: - localname = prefix + ':' + localname + localname = prefix + ":" + localname localname = str(localname).lower() - if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, 
attrs.items(), localname))
+        if _debug:
+            sys.stderr.write(
+                f"startElementNS: qname = {qname}, namespace = {namespace}, givenprefix = {givenprefix}, prefix = {prefix}, attrs = {attrs.items()}, localname = {localname}\n"
+            )
 
     # qname implementation is horribly broken in Python 2.1 (it
     # doesn't report any), and slightly broken in Python 2.2 (it
@@ -1456,70 +1839,81 @@ def startElementNS(self, name, qname, attrs):
     # tirelessly telling me that it didn't work yet.
         attrsD = {}
         for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
-            lowernamespace = (namespace or '').lower()
-            prefix = self._matchnamespaces.get(lowernamespace, '')
+            lowernamespace = (namespace or "").lower()
+            prefix = self._matchnamespaces.get(lowernamespace, "")
             if prefix:
-                attrlocalname = prefix + ':' + attrlocalname
+                attrlocalname = prefix + ":" + attrlocalname
             attrsD[str(attrlocalname).lower()] = attrvalue
         for qname in attrs.getQNames():
             attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
         self.unknown_starttag(localname, attrsD.items())
 
-    def characters(self, text):
+    def characters(self, text) -> None:
         self.handle_data(text)
 
-    def endElementNS(self, name, qname):
+    def endElementNS(self, name, qname) -> None:
         namespace, localname = name
-        lowernamespace = str(namespace or '').lower()
-        if qname and qname.find(':') > 0:
-            givenprefix = qname.split(':')[0]
-        else:
-            givenprefix = ''
+        lowernamespace = str(namespace or "").lower()
+        givenprefix = qname.split(":")[0] if qname and qname.find(":") > 0 else ""
         prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
         if prefix:
-            localname = prefix + ':' + localname
+            localname = prefix + ":" + localname
         localname = str(localname).lower()
         self.unknown_endtag(localname)
 
-    def error(self, exc):
+    def error(self, exc) -> None:
         self.bozo = 1
         self.exc = exc
 
-    def fatalError(self, exc):
+    def fatalError(self, exc) -> Never:
         self.error(exc)
         raise exc
 
 
 class _BaseHTMLProcessor(HTMLParser):
-    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
-      'img', 'input', 'isindex', 'link', 'meta', 'param']
-
-    tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
-    charref = re.compile('&#(\d+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
-    special = re.compile('<!')
-
-    def __init__(self, encoding):
+    elements_no_end_tag = [
+        "area",
+        "base",
+        "basefont",
+        "br",
+        "col",
+        "frame",
+        "hr",
+        "img",
+        "input",
+        "isindex",
+        "link",
+        "meta",
+        "param",
+    ]
+
+    tagfind = re.compile(r"[a-zA-Z][-_.:a-zA-Z0-9]*")
+    charref = re.compile(r"&#(\d+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]")
+    special = re.compile("<!")
+
+    def __init__(self, encoding) -> None:
         self.encoding = encoding
-        if _debug: sys.stderr.write(f'entering BaseHTMLProcessor, encoding={self.encoding}\n')
+        if _debug:
+            sys.stderr.write(f"entering BaseHTMLProcessor, encoding={self.encoding}\n")
         super().__init__(convert_charrefs=False)
         self.reset()
 
-    def reset(self):
+    def reset(self) -> None:
         self.pieces = []
         super().reset()
 
     def _shorttag_replace(self, match):
         tag = match.group(1)
         if tag in self.elements_no_end_tag:
-            return '<' + tag + ' />'
+            return "<" + tag + " />"
         else:
-            return '<' + tag + '>'
+            return "<" + tag + ">"
 
-    def feed(self, data):
-        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
-        data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data)
-        data = data.replace('&#39;', "'")
-        data = data.replace('&#34;', '"')
+    def feed(self, data) -> None:
+        data = re.compile(r"<!((?!DOCTYPE|--|\[))", re.IGNORECASE).sub(r"&lt;!\1", data)
+        data = re.sub(r"<(\S+?)\s*?/>", self._shorttag_replace, data)
+        data = data.replace("&#39;", "'")
+        data = data.replace("&#34;", '"')
         if self.encoding and isinstance(data, str):
             data = data.encode(self.encoding)
         super().feed(data)
@@ -1527,70 +1921,72 @@ def feed(self, data):
     def normalize_attrs(self, attrs):
         # utility method to be called by descendants
         attrs = [(k.lower(), v) for k, v in attrs]
-        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
-        return attrs
+        return [(k, k in ("rel", "type") and v.lower() or v) for k, v in attrs]
 
-    def unknown_starttag(self, tag, attrs):
+    def unknown_starttag(self, tag, attrs) -> None:
         # called for each start tag
         # attrs is a list of (attr, value) tuples
         # e.g. for <pre class="screen">, tag='pre', attrs=[('class', 'screen')]
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
+        if _debug:
+            sys.stderr.write(f"_BaseHTMLProcessor, unknown_starttag, tag={tag}\n")
         uattrs = []
         # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
         for key, value in attrs:
-            if type(value) != type(''):
+            if not isinstance(value, str):
                 value = str(value, self.encoding)
             uattrs.append((str(key, self.encoding), value))
-        strattrs = ''.join([' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
+        strattrs = "".join([f' {key}="{value}"' for key, value in uattrs]).encode(self.encoding)
         if tag in self.elements_no_end_tag:
-            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
+            self.pieces.append("<{tag}{strattrs} />".format(**locals()))
         else:
-            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+            self.pieces.append("<{tag}{strattrs}>".format(**locals()))
 
-    def unknown_endtag(self, tag):
+    def unknown_endtag(self, tag) -> None:
         # called for each end tag, e.g. for </pre>, tag will be 'pre'
         # Reconstruct the original end tag.
         if tag not in self.elements_no_end_tag:
-            self.pieces.append("</%(tag)s>" % locals())
+            self.pieces.append("</{tag}>".format(**locals()))
 
-    def handle_charref(self, ref):
+    def handle_charref(self, ref) -> None:
         # called for each character reference, e.g. for '&#160;', ref will be '160'
         # Reconstruct the original character reference.
-        self.pieces.append('&#%(ref)s;' % locals())
+        self.pieces.append("&#{ref};".format(**locals()))
 
-    def handle_entityref(self, ref):
+    def handle_entityref(self, ref) -> None:
         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
         # Reconstruct the original entity reference.
         if ref in html.entities.name2codepoint:
-            self.pieces.append(f'&{ref};')
+            self.pieces.append(f"&{ref};")
         else:
-            self.pieces.append(f'&{ref}')
+            self.pieces.append(f"&{ref}")
 
-    def handle_data(self, text):
+    def handle_data(self, text) -> None:
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         # Store the original text verbatim.
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
+        if _debug:
+            sys.stderr.write(f"_BaseHTMLProcessor, handle_text, text={text}\n")
         self.pieces.append(text)
 
-    def handle_comment(self, text):
+    def handle_comment(self, text) -> None:
         # called for each HTML comment, e.g. <!-- insert Python code here -->
         # Reconstruct the original comment.
-        self.pieces.append('<!--%(text)s-->' % locals())
+        self.pieces.append("<!--{text}-->".format(**locals()))
 
-    def handle_pi(self, text):
+    def handle_pi(self, text) -> None:
         # called for each processing instruction, e.g. <?instruction>
         # Reconstruct original processing instruction.
-        self.pieces.append('<?%(text)s>' % locals())
+        self.pieces.append("<?{text}>".format(**locals()))
 
-    def handle_decl(self, text):
+    def handle_decl(self, text) -> None:
         # called for the DOCTYPE, if present, e.g.
         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
         #     "http://www.w3.org/TR/html4/loose.dtd">
         # Reconstruct original DOCTYPE
-        self.pieces.append('<!%(text)s>' % locals())
+        self.pieces.append("<!{text}>".format(**locals()))
+
+    _new_declname_match = re.compile(r"[a-zA-Z][-_.a-zA-Z0-9:]*\s*").match
 
-    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
     def _scan_name(self, i, declstartpos):
         rawdata = self.rawdata
         n = len(rawdata)
@@ -1605,118 +2001,252 @@ def _scan_name(self, i, declstartpos):
             return name.lower(), m.end()
         else:
             self.handle_data(rawdata)
-#            self.updatepos(declstartpos, i)
+            # self.updatepos(declstartpos, i)
         return None, -1
 
     def output(self):
-        '''Return processed HTML as a single string'''
-        return ''.join([str(p) for p in self.pieces])
+        """Return processed HTML as a single string."""
+        return "".join([str(p) for p in self.pieces])
 
 
 class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
-    def __init__(self, baseuri, baselang, encoding):
+    def __init__(self, baseuri, baselang, encoding) -> None:
        super().__init__(encoding)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
 
     def decodeEntities(self, element, data):
-        data = data.replace('&#60;', '&lt;')
-        data = data.replace('&#x3c;', '&lt;')
-        data = data.replace('&#x3C;', '&lt;')
-        data = data.replace('&#62;', '&gt;')
-        data = data.replace('&#x3e;', '&gt;')
-        data = data.replace('&#x3E;', '&gt;')
-        data = data.replace('&#38;', '&amp;')
-        data = data.replace('&#x26;', '&amp;')
-        data = data.replace('&#34;', '&quot;')
-        data = data.replace('&#x22;', '&quot;')
-        data = data.replace('&#39;', '&apos;')
-        data = data.replace('&#x27;', '&apos;')
-        if 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'):
-            data = data.replace('&lt;', '<')
-            data = data.replace('&gt;', '>')
-            data = data.replace('&amp;', '&')
-            data = data.replace('&quot;', '"')
-            data = data.replace('&apos;', "'")
+        data = data.replace("&#60;", "&lt;")
+        data = data.replace("&#x3c;", "&lt;")
+        data = data.replace("&#x3C;", "&lt;")
+        data = data.replace("&#62;", "&gt;")
+        data = data.replace("&#x3e;", "&gt;")
+        data = data.replace("&#x3E;", "&gt;")
+        data = data.replace("&#38;", "&amp;")
+        data = data.replace("&#x26;", "&amp;")
+        data = data.replace("&#34;", "&quot;")
+        data = data.replace("&#x22;", "&quot;")
+        data = data.replace("&#39;", "&apos;")
+        data = data.replace("&#x27;", "&apos;")
+        if "type" in self.contentparams and not self.contentparams.get("type", "xml").endswith("xml"):
+            data = data.replace("&lt;", "<")
+            data = data.replace("&gt;", ">")
+            data = data.replace("&amp;", "&")
+            data = data.replace("&quot;", '"')
+            data = data.replace("&apos;", "'")
         return data
 
     def strattrs(self, attrs):
-        return ''.join([f' {k}="{v}"' for k, v in attrs])
+        return "".join([f' {k}="{v}"' for k, v in attrs])
 
 
 class _RelativeURIResolver(_BaseHTMLProcessor):
-    relative_uris = [('a', 'href'),
-                     ('applet', 'codebase'),
-                     ('area', 'href'),
-                     ('blockquote', 'cite'),
-                     ('body', 'background'),
-                     ('del', 'cite'),
-                     ('form', 'action'),
-                     ('frame', 'longdesc'),
-                     ('frame', 'src'),
-                     ('iframe', 'longdesc'),
-                     ('iframe', 'src'),
-                     ('head', 'profile'),
-                     ('img', 'longdesc'),
-                     ('img', 'src'),
-                     ('img', 'usemap'),
-                     ('input', 'src'),
-                     ('input', 'usemap'),
-                     ('ins', 'cite'),
-                     ('link', 'href'),
-                     ('object', 'classid'),
-                     ('object', 'codebase'),
-                     ('object', 'data'),
-                     ('object', 'usemap'),
-                     ('q', 'cite'),
-                     ('script', 'src')]
-
-    def __init__(self, baseuri, encoding):
+    relative_uris = [
+        ("a", "href"),
+        ("applet", "codebase"),
+        ("area", "href"),
+        ("blockquote", "cite"),
+        ("body", "background"),
+        ("del", "cite"),
+        ("form", "action"),
+        ("frame", "longdesc"),
+        ("frame", "src"),
+        ("iframe", "longdesc"),
+        ("iframe", "src"),
+        ("head", "profile"),
+        ("img", "longdesc"),
+        ("img", "src"),
+        ("img", "usemap"),
+        ("input", "src"),
+        ("input", "usemap"),
+        
("ins", "cite"), + ("link", "href"), + ("object", "classid"), + ("object", "codebase"), + ("object", "data"), + ("object", "usemap"), + ("q", "cite"), + ("script", "src"), + ] + + def __init__(self, baseuri, encoding) -> None: _BaseHTMLProcessor.__init__(self, encoding) self.baseuri = baseuri def resolveURI(self, uri): return _urljoin(self.baseuri, uri) - def unknown_starttag(self, tag, attrs): + def unknown_starttag(self, tag, attrs) -> None: attrs = self.normalize_attrs(attrs) attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) + def _resolveRelativeURIs(htmlSource, baseURI, encoding): - if _debug: sys.stderr.write('entering _resolveRelativeURIs\n') + if _debug: + sys.stderr.write("entering _resolveRelativeURIs\n") p = _RelativeURIResolver(baseURI, encoding) p.feed(htmlSource) return p.output() + class _HTMLSanitizer(_BaseHTMLProcessor): - acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big', - 'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col', - 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset', - 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', - 'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup', - 'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike', - 'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th', - 'thead', 'tr', 'tt', 'u', 'ul', 'var'] - - acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', - 'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', - 'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols', - 'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', - 'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', - 'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', - 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', - 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', - 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', - 'usemap', 'valign', 'value', 'vspace', 'width', 'xml:lang'] - - unacceptable_elements_with_end_tag = ['script', 'applet'] - - def reset(self): + acceptable_elements = [ + "a", + "abbr", + "acronym", + "address", + "area", + "b", + "big", + "blockquote", + "br", + "button", + "caption", + "center", + "cite", + "code", + "col", + "colgroup", + "dd", + "del", + "dfn", + "dir", + "div", + "dl", + "dt", + "em", + "fieldset", + "font", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "hr", + "i", + "img", + "input", + "ins", + "kbd", + "label", + "legend", + "li", + "map", + "menu", + "ol", + "optgroup", + "option", + "p", + "pre", + "q", + "s", + "samp", + "select", + "small", + "span", + "strike", + "strong", + "sub", + "sup", + "table", + "tbody", + "td", + "textarea", + "tfoot", + "th", + "thead", + "tr", + "tt", + "u", + "ul", + "var", + ] + + acceptable_attributes = [ + "abbr", + "accept", + "accept-charset", + "accesskey", + "action", + "align", + "alt", + "axis", + "border", + "cellpadding", + "cellspacing", + "char", + "charoff", + "charset", + "checked", + "cite", + "class", + "clear", + "cols", + "colspan", + "color", + "compact", + "coords", + "datetime", + "dir", + "disabled", + "enctype", + "for", + "frame", + "headers", + "height", + "href", + "hreflang", + 
"hspace", + "id", + "ismap", + "label", + "lang", + "longdesc", + "maxlength", + "media", + "method", + "multiple", + "name", + "nohref", + "noshade", + "nowrap", + "prompt", + "readonly", + "rel", + "rev", + "rows", + "rowspan", + "rules", + "scope", + "selected", + "shape", + "size", + "span", + "src", + "start", + "summary", + "tabindex", + "target", + "title", + "type", + "usemap", + "valign", + "value", + "vspace", + "width", + "xml:lang", + ] + + unacceptable_elements_with_end_tag = ["script", "applet"] + + def reset(self) -> None: _BaseHTMLProcessor.reset(self) self.unacceptablestack = 0 - def unknown_starttag(self, tag, attrs): - if not tag in self.acceptable_elements: + def unknown_starttag(self, tag, attrs) -> None: + if tag not in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack += 1 return @@ -1724,23 +2254,24 @@ def unknown_starttag(self, tag, attrs): attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes] _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - def unknown_endtag(self, tag): - if not tag in self.acceptable_elements: + def unknown_endtag(self, tag) -> None: + if tag not in self.acceptable_elements: if tag in self.unacceptable_elements_with_end_tag: self.unacceptablestack -= 1 return _BaseHTMLProcessor.unknown_endtag(self, tag) - def handle_pi(self, text): + def handle_pi(self, text) -> None: pass - def handle_decl(self, text): + def handle_decl(self, text) -> None: pass - def handle_data(self, text): + def handle_data(self, text) -> None: if not self.unacceptablestack: _BaseHTMLProcessor.handle_data(self, text) + # TODO(py2to3): replace tidy and mx def _sanitizeHTML(htmlSource, encoding): p = _HTMLSanitizer(encoding) @@ -1753,36 +2284,41 @@ def _sanitizeHTML(htmlSource, encoding): for tidy_interface in PREFERRED_TIDY_INTERFACES: try: if tidy_interface == "uTidy": - from tidy import parseString as _utidy + def _tidy(data, **kwargs): return str(_utidy(data, **kwargs)) + break elif tidy_interface == "mxTidy": from mx.Tidy import Tidy as _mxtidy + def _tidy(data, **kwargs): nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs) return data + break except: pass if _tidy: - utf8 = type(data) == type('') + utf8 = type(data) == str if utf8: - data = data.encode('utf-8') + data = data.encode("utf-8") data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8") if utf8: - data = str(data, 'utf-8') - if data.count(''): - data = data.split('>', 1)[1] - if data.count('"): + data = data.split(">", 1)[1] + if data.count("= '2.3.3' - assert base64 != None - user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':') - realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0] + assert sys.version.split()[0] >= "2.3.3" + assert base64 is not None + user, passw = base64.decodestring(req.headers["Authorization"].split(" ")[1]).split(":") + realm = re.findall('realm="([^"]*)"', headers["WWW-Authenticate"])[0] self.add_password(realm, host, user, passw) - retry = self.http_error_auth_reqed('www-authenticate', host, req, headers) + retry = self.http_error_auth_reqed("www-authenticate", host, req, headers) self.reset_retry_count() return retry except: return self.http_error_default(req, fp, code, msg, headers) + def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): - """URL, filename, or string --> stream + """URL, filename, or string --> stream. 
This function lets you define parsers that take any input source (URL, pathname to local or network file, or actual data as a string) @@ -1862,14 +2399,13 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h If handlers is supplied, it is a list of handlers used to build a urllib2 opener. """ - - if hasattr(url_file_stream_or_string, 'read'): + if hasattr(url_file_stream_or_string, "read"): return url_file_stream_or_string - if url_file_stream_or_string == '-': + if url_file_stream_or_string == "-": return sys.stdin - if urllib.parse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): + if urllib.parse.urlparse(url_file_stream_or_string)[0] in ("http", "https", "ftp"): if not agent: agent = USER_AGENT # test for inline user:password for basic auth @@ -1880,42 +2416,54 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h if realhost: user_passwd, realhost = urllib.splituser(realhost) if user_passwd: - url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) + url_file_stream_or_string = f"{urltype}://{realhost}{rest}" auth = base64.encodestring(user_passwd).strip() # try to open with urllib2 (to use optional headers) request = urllib.request.Request(url_file_stream_or_string) - request.add_header('User-Agent', agent) + request.add_header("User-Agent", agent) if etag: - request.add_header('If-None-Match', etag) + request.add_header("If-None-Match", etag) if modified: # format into an RFC 1123-compliant timestamp. We can't use # time.strftime() since the %a and %b directives can be affected # by the current locale, but RFC 2616 states that dates must be # in English. - short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] - request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) + short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"] + months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"] + request.add_header( + "If-Modified-Since", + "%s, %02d %s %04d %02d:%02d:%02d GMT" + % ( + short_weekdays[modified[6]], + modified[2], + months[modified[1] - 1], + modified[0], + modified[3], + modified[4], + modified[5], + ), + ) if referrer: - request.add_header('Referer', referrer) + request.add_header("Referer", referrer) if gzip and zlib: - request.add_header('Accept-encoding', 'gzip, deflate') + request.add_header("Accept-encoding", "gzip, deflate") elif gzip: - request.add_header('Accept-encoding', 'gzip') + request.add_header("Accept-encoding", "gzip") elif zlib: - request.add_header('Accept-encoding', 'deflate') + request.add_header("Accept-encoding", "deflate") else: - request.add_header('Accept-encoding', '') + request.add_header("Accept-encoding", "") if auth: - request.add_header('Authorization', 'Basic %s' % auth) + request.add_header("Authorization", f"Basic {auth}") if ACCEPT_HEADER: - request.add_header('Accept', ACCEPT_HEADER) - request.add_header('A-IM', 'feed') # RFC 3229 support - opener = urllib.request.build_opener(*([_FeedURLHandler()] + handlers)) - opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent + request.add_header("Accept", ACCEPT_HEADER) + request.add_header("A-IM", "feed") # RFC 3229 support + opener = urllib.request.build_opener(*([_FeedURLHandler(), 
*handlers]))
+    opener.addheaders = []  # RMK - must clear so we only send our custom User-Agent
     try:
         return opener.open(request)
     finally:
-        opener.close() # JohnD
+        opener.close()  # JohnD
 
     # try to open with native open function (if url_file_stream_or_string is a filename)
     try:
@@ -1926,11 +2474,15 @@ def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, h
     # treat url_file_stream_or_string as string
     return _StringIO(str(url_file_stream_or_string))
 
+
 _date_handlers = []
-def registerDateHandler(func):
-    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
+
+
+def registerDateHandler(func) -> None:
+    """Register a date handler function (takes string, returns 9-tuple date in GMT)."""
     _date_handlers.insert(0, func)
 
+
 # ISO-8601 date parsing routines written by Fazal Majid.
 # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
 # parser is beyond the scope of feedparser and would be a worthwhile addition
@@ -1940,67 +2492,73 @@ def registerDateHandler(func):
 # 0301-04-01), so we use templates instead.
 # Please note the order in templates is significant because we need a
 # greedy match.
-_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
-                 'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
-                 '-YY-?MM', '-OOO', '-YY',
-                 '--MM-?DD', '--MM',
-                 '---DD',
-                 'CC', '']
+_iso8601_tmpl = [
+    "YYYY-?MM-?DD",
+    "YYYY-MM",
+    "YYYY-?OOO",
+    "YY-?MM-?DD",
+    "YY-?OOO",
+    "YYYY",
+    "-YY-?MM",
+    "-OOO",
+    "-YY",
+    "--MM-?DD",
+    "--MM",
+    "---DD",
+    "CC",
+    "",
+]
 _iso8601_re = [
-    tmpl.replace(
-        'YYYY', r'(?P<year>\d{4})').replace(
-        'YY', r'(?P<year>\d\d)').replace(
-        'MM', r'(?P<month>[01]\d)').replace(
-        'DD', r'(?P<day>[0123]\d)').replace(
-        'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
-        'CC', r'(?P<century>\d\d$)')
-    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
-    + r'(:(?P<second>\d{2}))?'
-    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
-    for tmpl in _iso8601_tmpl]
+    tmpl.replace("YYYY", r"(?P<year>\d{4})")
+    .replace("YY", r"(?P<year>\d\d)")
+    .replace("MM", r"(?P<month>[01]\d)")
+    .replace("DD", r"(?P<day>[0123]\d)")
+    .replace("OOO", r"(?P<ordinal>[0123]\d\d)")
+    .replace("CC", r"(?P<century>\d\d$)")
+    + r"(T?(?P<hour>\d{2}):(?P<minute>\d{2})"
+    + r"(:(?P<second>\d{2}))?"
+    + r"(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?"
+    for tmpl in _iso8601_tmpl
+]
 _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
 
 
 def _parse_date_iso8601(dateString):
-    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
+    """Parse a variety of ISO-8601-compatible formats like 20040105."""
     m = None
     for _iso8601_match in _iso8601_matches:
         m = _iso8601_match(dateString)
-        if m: break
-    if not m: return
-    if m.span() == (0, 0): return
+        if m:
+            break
+    if not m:
+        return None
+    if m.span() == (0, 0):
+        return None
     params = m.groupdict()
-    ordinal = params.get('ordinal', 0)
-    if ordinal:
-        ordinal = int(ordinal)
-    else:
-        ordinal = 0
-    year = params.get('year', '--')
-    if not year or year == '--':
+    ordinal = params.get("ordinal", 0)
+    ordinal = int(ordinal) if ordinal else 0
+    year = params.get("year", "--")
+    if not year or year == "--":
         year = time.gmtime()[0]
     elif len(year) == 2:
         # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
         year = 100 * int(time.gmtime()[0] / 100) + int(year)
     else:
         year = int(year)
-    month = params.get('month', '-')
-    if not month or month == '-':
+    month = params.get("month", "-")
+    if not month or month == "-":
         # ordinals are NOT normalized by mktime, we simulate them
         # by setting month=1, day=ordinal
-        if ordinal:
-            month = 1
-        else:
-            month = time.gmtime()[1]
+        month = 1 if ordinal else time.gmtime()[1]
     month = int(month)
-    day = params.get('day', 0)
+    day = params.get("day", 0)
     if not day:
         # see above
         if ordinal:
             day = ordinal
-        elif params.get('century', 0) or \
-            params.get('year', 0) or params.get('month', 0):
+        elif params.get("century", 0) or params.get("year", 0) or params.get("month", 0):
             day = 1
         else:
             day = time.gmtime()[2]
@@ -2008,15 +2566,15 @@ def _parse_date_iso8601(dateString):
         day = int(day)
     # special case of the century - is the first year of the 21st century
     # 2000 or 2001 ? The debate goes on...
-    if 'century' in params.keys():
-        year = (int(params['century']) - 1) * 100 + 1
+    if "century" in params:
+        year = (int(params["century"]) - 1) * 100 + 1
     # in ISO 8601 most fields are optional
-    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
+    for field in ["hour", "minute", "second", "tzhour", "tzmin"]:
         if not params.get(field, None):
             params[field] = 0
-    hour = int(params.get('hour', 0))
-    minute = int(params.get('minute', 0))
-    second = int(params.get('second', 0))
+    hour = int(params.get("hour", 0))
+    minute = int(params.get("minute", 0))
+    second = int(params.get("second", 0))
     # weekday is normalized by mktime(), we can ignore it
     weekday = 0
     # daylight savings is complex, but not needed for feedparser's purposes
@@ -2024,192 +2582,224 @@ def _parse_date_iso8601(dateString):
     # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
     # and most implementations have DST bugs
     daylight_savings_flag = 0
-    tm = [year, month, day, hour, minute, second, weekday,
-          ordinal, daylight_savings_flag]
+    tm = [year, month, day, hour, minute, second, weekday, ordinal, daylight_savings_flag]
     # ISO 8601 time zone adjustments
-    tz = params.get('tz')
-    if tz and tz != 'Z':
-        if tz[0] == '-':
-            tm[3] += int(params.get('tzhour', 0))
-            tm[4] += int(params.get('tzmin', 0))
-        elif tz[0] == '+':
-            tm[3] -= int(params.get('tzhour', 0))
-            tm[4] -= int(params.get('tzmin', 0))
+    tz = params.get("tz")
+    if tz and tz != "Z":
+        if tz[0] == "-":
+            tm[3] += int(params.get("tzhour", 0))
+            tm[4] += int(params.get("tzmin", 0))
+        elif tz[0] == "+":
+            tm[3] -= int(params.get("tzhour", 0))
+            tm[4] -= int(params.get("tzmin", 0))
         else:
             return None
     # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
     # which is guaranteed to normalize d/m/y/h/m/s.
     # Many implementations have bugs, but we'll pretend they don't.
-    return time.localtime(time.mktime(tm))
+    return time.localtime(time.mktime(tuple(tm)))  # mktime() requires a tuple, not a list
+
 
 registerDateHandler(_parse_date_iso8601)
 
+
 # 8-bit date handling routines written by ytrewq1.
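Every parser registered in this section follows the registerDateHandler() contract established above: take a date string, return a 9-tuple in GMT, or None to let the next handler try. For illustration, a hypothetical extra handler for bare "YYYYMMDDHHMMSS" stamps (not a format this file actually supports) would plug in the same way:

    def _parse_date_compact(dateString):
        # Assumption for this sketch: the stamp is already GMT.
        try:
            return time.strptime(dateString, "%Y%m%d%H%M%S")  # struct_time is a 9-tuple
        except ValueError:
            return None

    registerDateHandler(_parse_date_compact)

Because registerDateHandler() inserts at position 0, the most recently registered handler is tried first, and a None return falls through to the rest of the chain.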
-_korean_year = '\ub144' # b3e2 in euc-kr -_korean_month = '\uc6d4' # bff9 in euc-kr -_korean_day = '\uc77c' # c0cf in euc-kr -_korean_am = '\uc624\uc804' # bfc0 c0fc in euc-kr -_korean_pm = '\uc624\ud6c4' # bfc0 c8c4 in euc-kr - -_korean_onblog_date_re = \ - re.compile(r'(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ - (_korean_year, _korean_month, _korean_day)) -_korean_nate_date_re = \ - re.compile(r'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ - (_korean_am, _korean_pm)) +_korean_year = "\ub144" # b3e2 in euc-kr +_korean_month = "\uc6d4" # bff9 in euc-kr +_korean_day = "\uc77c" # c0cf in euc-kr +_korean_am = "\uc624\uc804" # bfc0 c0fc in euc-kr +_korean_pm = "\uc624\ud6c4" # bfc0 c8c4 in euc-kr + +_korean_onblog_date_re = re.compile( + rf"(\d{{4}}){_korean_year}\s+(\d{{2}}){_korean_month}\s+(\d{{2}}){_korean_day}\s+(\d{{2}}):(\d{{2}}):(\d{{2}})" +) +_korean_nate_date_re = re.compile( + rf"(\d{{4}})-(\d{{2}})-(\d{{2}})\s+({_korean_am}|{_korean_pm})\s+(\d{{,2}}):(\d{{,2}}):(\d{{,2}})" +) + + def _parse_date_onblog(dateString): - '''Parse a string according to the OnBlog 8-bit date format''' + """Parse a string according to the OnBlog 8-bit date format.""" m = _korean_onblog_date_re.match(dateString) - if not m: return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) + if not m: + return None + w3dtfdate = "{year}-{month}-{day}T{hour}:{minute}:{second}{zonediff}".format( + year=m.group(1), + month=m.group(2), + day=m.group(3), + hour=m.group(4), + minute=m.group(5), + second=m.group(6), + zonediff="+09:00", + ) + if _debug: + sys.stderr.write(f"OnBlog date parsed as: {w3dtfdate}\n") return _parse_date_w3dtf(w3dtfdate) + + registerDateHandler(_parse_date_onblog) + def _parse_date_nate(dateString): - '''Parse a string according to the Nate 8-bit date format''' + """Parse a string according to the Nate 8-bit date format.""" m = _korean_nate_date_re.match(dateString) - if not m: return + if not m: + return None hour = int(m.group(5)) ampm = m.group(4) - if (ampm == _korean_pm): + if ampm == _korean_pm: hour += 12 hour = str(hour) if len(hour) == 1: - hour = '0' + hour - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate) + hour = "0" + hour + w3dtfdate = "{year}-{month}-{day}T{hour}:{minute}:{second}{zonediff}".format( + year=m.group(1), + month=m.group(2), + day=m.group(3), + hour=hour, + minute=m.group(6), + second=m.group(7), + zonediff="+09:00", + ) + if _debug: + sys.stderr.write(f"Nate date parsed as: {w3dtfdate}\n") return _parse_date_w3dtf(w3dtfdate) + + registerDateHandler(_parse_date_nate) -_mssql_date_re = \ - re.compile(r'(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?') +_mssql_date_re = re.compile(r"(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?") + + def _parse_date_mssql(dateString): - '''Parse a string according to the MS SQL date format''' + """Parse a string according to the MS SQL date format.""" m = _mssql_date_re.match(dateString) - if not m: return - w3dtfdate = 
'%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ - {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ - 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ - 'zonediff': '+09:00'} - if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate) + if not m: + return None + w3dtfdate = "{year}-{month}-{day}T{hour}:{minute}:{second}{zonediff}".format( + year=m.group(1), + month=m.group(2), + day=m.group(3), + hour=m.group(4), + minute=m.group(5), + second=m.group(6), + zonediff="+09:00", + ) + if _debug: + sys.stderr.write(f"MS SQL date parsed as: {w3dtfdate}\n") return _parse_date_w3dtf(w3dtfdate) + + registerDateHandler(_parse_date_mssql) # Unicode strings for Greek date strings -_greek_months = \ - { \ - '\u0399\u03b1\u03bd': 'Jan', # c9e1ed in iso-8859-7 - '\u03a6\u03b5\u03b2': 'Feb', # d6e5e2 in iso-8859-7 - '\u039c\u03ac\u03ce': 'Mar', # ccdcfe in iso-8859-7 - '\u039c\u03b1\u03ce': 'Mar', # cce1fe in iso-8859-7 - '\u0391\u03c0\u03c1': 'Apr', # c1f0f1 in iso-8859-7 - '\u039c\u03ac\u03b9': 'May', # ccdce9 in iso-8859-7 - '\u039c\u03b1\u03ca': 'May', # cce1fa in iso-8859-7 - '\u039c\u03b1\u03b9': 'May', # cce1e9 in iso-8859-7 - '\u0399\u03bf\u03cd\u03bd': 'Jun', # c9effded in iso-8859-7 - '\u0399\u03bf\u03bd': 'Jun', # c9efed in iso-8859-7 - '\u0399\u03bf\u03cd\u03bb': 'Jul', # c9effdeb in iso-8859-7 - '\u0399\u03bf\u03bb': 'Jul', # c9f9eb in iso-8859-7 - '\u0391\u03cd\u03b3': 'Aug', # c1fde3 in iso-8859-7 - '\u0391\u03c5\u03b3': 'Aug', # c1f5e3 in iso-8859-7 - '\u03a3\u03b5\u03c0': 'Sep', # d3e5f0 in iso-8859-7 - '\u039f\u03ba\u03c4': 'Oct', # cfeaf4 in iso-8859-7 - '\u039d\u03bf\u03ad': 'Nov', # cdefdd in iso-8859-7 - '\u039d\u03bf\u03b5': 'Nov', # cdefe5 in iso-8859-7 - '\u0394\u03b5\u03ba': 'Dec', # c4e5ea in iso-8859-7 - } - -_greek_wdays = \ - { \ - '\u039a\u03c5\u03c1': 'Sun', # caf5f1 in iso-8859-7 - '\u0394\u03b5\u03c5': 'Mon', # c4e5f5 in iso-8859-7 - '\u03a4\u03c1\u03b9': 'Tue', # d4f1e9 in iso-8859-7 - '\u03a4\u03b5\u03c4': 'Wed', # d4e5f4 in iso-8859-7 - '\u03a0\u03b5\u03bc': 'Thu', # d0e5ec in iso-8859-7 - '\u03a0\u03b1\u03c1': 'Fri', # d0e1f1 in iso-8859-7 - '\u03a3\u03b1\u03b2': 'Sat', # d3e1e2 in iso-8859-7 - } - -_greek_date_format_re = \ - re.compile(r'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)') +_greek_months = { + "\u0399\u03b1\u03bd": "Jan", # c9e1ed in iso-8859-7 + "\u03a6\u03b5\u03b2": "Feb", # d6e5e2 in iso-8859-7 + "\u039c\u03ac\u03ce": "Mar", # ccdcfe in iso-8859-7 + "\u039c\u03b1\u03ce": "Mar", # cce1fe in iso-8859-7 + "\u0391\u03c0\u03c1": "Apr", # c1f0f1 in iso-8859-7 + "\u039c\u03ac\u03b9": "May", # ccdce9 in iso-8859-7 + "\u039c\u03b1\u03ca": "May", # cce1fa in iso-8859-7 + "\u039c\u03b1\u03b9": "May", # cce1e9 in iso-8859-7 + "\u0399\u03bf\u03cd\u03bd": "Jun", # c9effded in iso-8859-7 + "\u0399\u03bf\u03bd": "Jun", # c9efed in iso-8859-7 + "\u0399\u03bf\u03cd\u03bb": "Jul", # c9effdeb in iso-8859-7 + "\u0399\u03bf\u03bb": "Jul", # c9f9eb in iso-8859-7 + "\u0391\u03cd\u03b3": "Aug", # c1fde3 in iso-8859-7 + "\u0391\u03c5\u03b3": "Aug", # c1f5e3 in iso-8859-7 + "\u03a3\u03b5\u03c0": "Sep", # d3e5f0 in iso-8859-7 + "\u039f\u03ba\u03c4": "Oct", # cfeaf4 in iso-8859-7 + "\u039d\u03bf\u03ad": "Nov", # cdefdd in iso-8859-7 + "\u039d\u03bf\u03b5": "Nov", # cdefe5 in iso-8859-7 + "\u0394\u03b5\u03ba": "Dec", # c4e5ea in iso-8859-7 +} + +_greek_wdays = { + "\u039a\u03c5\u03c1": "Sun", # caf5f1 in iso-8859-7 + "\u0394\u03b5\u03c5": "Mon", # c4e5f5 in iso-8859-7 + 
"\u03a4\u03c1\u03b9": "Tue", # d4f1e9 in iso-8859-7 + "\u03a4\u03b5\u03c4": "Wed", # d4e5f4 in iso-8859-7 + "\u03a0\u03b5\u03bc": "Thu", # d0e5ec in iso-8859-7 + "\u03a0\u03b1\u03c1": "Fri", # d0e1f1 in iso-8859-7 + "\u03a3\u03b1\u03b2": "Sat", # d3e1e2 in iso-8859-7 +} + +_greek_date_format_re = re.compile(r"([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)") + def _parse_date_greek(dateString): - '''Parse a string according to a Greek 8-bit date format.''' + """Parse a string according to a Greek 8-bit date format.""" m = _greek_date_format_re.match(dateString) - if not m: return + if not m: + return None try: wday = _greek_wdays[m.group(1)] month = _greek_months[m.group(3)] except: - return - rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \ - {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\ - 'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\ - 'zonediff': m.group(8)} - if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date) + return None + rfc822date = f"{wday}, {m.group(2)} {month} {m.group(4)} {m.group(5)}:{m.group(6)}:{m.group(7)} {m.group(8)}" + if _debug: + sys.stderr.write(f"Greek date parsed as: {rfc822date}\n") return _parse_date_rfc822(rfc822date) + + registerDateHandler(_parse_date_greek) # Unicode strings for Hungarian date strings -_hungarian_months = \ - { \ - 'janu\u00e1r': '01', # e1 in iso-8859-2 - 'febru\u00e1ri': '02', # e1 in iso-8859-2 - 'm\u00e1rcius': '03', # e1 in iso-8859-2 - '\u00e1prilis': '04', # e1 in iso-8859-2 - 'm\u00e1ujus': '05', # e1 in iso-8859-2 - 'j\u00fanius': '06', # fa in iso-8859-2 - 'j\u00falius': '07', # fa in iso-8859-2 - 'augusztus': '08', - 'szeptember': '09', - 'okt\u00f3ber': '10', # f3 in iso-8859-2 - 'november': '11', - 'december': '12', - } - -_hungarian_date_format_re = \ - re.compile(r'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') +_hungarian_months = { + "janu\u00e1r": "01", # e1 in iso-8859-2 + "febru\u00e1ri": "02", # e1 in iso-8859-2 + "m\u00e1rcius": "03", # e1 in iso-8859-2 + "\u00e1prilis": "04", # e1 in iso-8859-2 + "m\u00e1ujus": "05", # e1 in iso-8859-2 + "j\u00fanius": "06", # fa in iso-8859-2 + "j\u00falius": "07", # fa in iso-8859-2 + "augusztus": "08", + "szeptember": "09", + "okt\u00f3ber": "10", # f3 in iso-8859-2 + "november": "11", + "december": "12", +} + +_hungarian_date_format_re = re.compile(r"(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))") + def _parse_date_hungarian(dateString): - '''Parse a string according to a Hungarian 8-bit date format.''' + """Parse a string according to a Hungarian 8-bit date format.""" m = _hungarian_date_format_re.match(dateString) - if not m: return + if not m: + return None try: month = _hungarian_months[m.group(2)] day = m.group(3) if len(day) == 1: - day = '0' + day + day = "0" + day hour = m.group(4) if len(hour) == 1: - hour = '0' + hour + hour = "0" + hour except: - return - w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \ - {'year': m.group(1), 'month': month, 'day': day,\ - 'hour': hour, 'minute': m.group(5),\ - 'zonediff': m.group(6)} - if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate) + return None + w3dtfdate = f"{m.group(1)}-{month}-{day}T{hour}:{m.group(5)}{m.group(6)}" + if _debug: + sys.stderr.write(f"Hungarian date parsed as: {w3dtfdate}\n") return _parse_date_w3dtf(w3dtfdate) + + registerDateHandler(_parse_date_hungarian) + # W3DTF-style date parsing 
 # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
 # Fred Drake and licensed under the Python license.  Removed all range checking
 # for month, day, hour, minute, and second, since mktime will normalize
 # these later
 def _parse_date_w3dtf(dateString):
     def __extract_date(m):
-        year = int(m.group('year'))
+        year = int(m.group("year"))
         if year < 100:
             year = 100 * int(time.gmtime()[0] / 100) + int(year)
         if year < 1000:
             return 0, 0, 0
-        julian = m.group('julian')
+        julian = m.group("julian")
         if julian:
             julian = int(julian)
-            month = julian / 30 + 1
+            month = julian // 30 + 1  # integer division: float months broke this under Python 3
@@ -2227,124 +2817,127 @@ def __extract_date(m):
                     day = 31
                 elif jday < julian:
                     if day + diff < 28:
-                        day = day + diff
+                        day = day + diff
                     else:
                         month = month + 1
             return year, month, day

-        month = m.group('month')
+        month = m.group("month")
         day = 1
         if month is None:
             month = 1
         else:
             month = int(month)
-            day = m.group('day')
-            if day:
-                day = int(day)
-            else:
-                day = 1
+            day = m.group("day")
+            day = int(day) if day else 1
         return year, month, day

     def __extract_time(m):
         if not m:
             return 0, 0, 0
-        hours = m.group('hours')
+        hours = m.group("hours")
         if not hours:
             return 0, 0, 0
         hours = int(hours)
-        minutes = int(m.group('minutes'))
-        seconds = m.group('seconds')
-        if seconds:
-            seconds = int(seconds)
-        else:
-            seconds = 0
+        minutes = int(m.group("minutes"))
+        seconds = m.group("seconds")
+        seconds = int(seconds) if seconds else 0
         return hours, minutes, seconds

     def __extract_tzd(m):
-        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
+        """Return the Time Zone Designator as an offset in seconds from UTC."""
         if not m:
             return 0
-        tzd = m.group('tzd')
+        tzd = m.group("tzd")
         if not tzd:
             return 0
-        if tzd == 'Z':
+        if tzd == "Z":
             return 0
-        hours = int(m.group('tzdhours'))
-        minutes = m.group('tzdminutes')
-        if minutes:
-            minutes = int(minutes)
-        else:
-            minutes = 0
-        offset = (hours*60 + minutes) * 60
-        if tzd[0] == '+':
+        hours = int(m.group("tzdhours"))
+        minutes = m.group("tzdminutes")
+        minutes = int(minutes) if minutes else 0
+        offset = (hours * 60 + minutes) * 60
+        if tzd[0] == "+":
             return -offset
         return offset

-    __date_re = (r'(?P<year>\d\d\d\d)'
-                 '(?:(?P<dsep>-|)'
-                 r'(?:(?P<julian>\d\d\d)'
-                 r'|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
-    __tzd_re = r'(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
+    __date_re = (
+        r"(?P<year>\d\d\d\d)"
+        "(?:(?P<dsep>-|)"
+        r"(?:(?P<julian>\d\d\d)"
+        r"|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?"
+    )
+    __tzd_re = r"(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)"
     __tzd_rx = re.compile(__tzd_re)
-    __time_re = (r'(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
-                 r'(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
-                 + __tzd_re)
-    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
+    __time_re = r"(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)" r"(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?" + __tzd_re
+    __datetime_re = f"{__date_re}(?:T{__time_re})?"
     __datetime_rx = re.compile(__datetime_re)
     m = __datetime_rx.match(dateString)
-    if (m is None) or (m.group() != dateString): return
+    if (m is None) or (m.group() != dateString):
+        return None
     gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
-    if gmt[0] == 0: return
+    if gmt[0] == 0:
+        return None
     return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
+
+
 registerDateHandler(_parse_date_w3dtf)
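For reference, the grammar above accepts anything from a bare year to a full timestamp with a timezone designator. A quick behavioral sketch of this private helper (illustrative values; assumes the named groups reconstructed above):

    import time

    from planet import feedparser

    t = feedparser._parse_date_w3dtf("2003-12-31T10:14:55Z")
    print(time.strftime("%Y-%m-%dT%H:%M:%SZ", t))           # 2003-12-31T10:14:55Z

    print(feedparser._parse_date_w3dtf("2003-12-31")[:3])   # (2003, 12, 31); time defaults to 00:00:00
    print(feedparser._parse_date_w3dtf("not a date"))       # None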

 def _parse_date_rfc822(dateString):
-    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
+    """Parse an RFC822, RFC1123, RFC2822, or asctime-style date."""
     data = dateString.split()
-    if data[0][-1] in (',', '.') or data[0].lower() in ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']:
+    if data[0][-1] in (",", ".") or data[0].lower() in ["mon", "tue", "wed", "thu", "fri", "sat", "sun"]:
         del data[0]
     if len(data) == 4:
         s = data[3]
-        i = s.find('+')
+        i = s.find("+")
         if i > 0:
-            data[3:] = [s[:i], s[i+1:]]
+            data[3:] = [s[:i], s[i + 1 :]]
         else:
-            data.append('')
+            data.append("")
         dateString = " ".join(data)
     if len(data) < 5:
-        dateString += ' 00:00:00 GMT'
+        dateString += " 00:00:00 GMT"
     tm = email.utils.parsedate_tz(dateString)
     if tm:
         return time.gmtime(calendar.timegm(tm[:9]))
+    return None
+

 # Define additional time zones
-_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
-email.utils._LOCALTZONES.update(_additional_timezones)
+_additional_timezones = {"AT": -400, "ET": -500, "CT": -600, "MT": -700, "PT": -800}
+email._parseaddr._timezones.update(_additional_timezones)  # same private-dict hack as py2's rfc822._timezones
 registerDateHandler(_parse_date_rfc822)
+

 def _parse_date(dateString):
-    '''Parses a variety of date formats into a 9-tuple in GMT'''
+    """Parses a variety of date formats into a 9-tuple in GMT."""
     for handler in _date_handlers:
         try:
             date9tuple = handler(dateString)
-            if not date9tuple: continue
+            if not date9tuple:
+                continue
             if len(date9tuple) != 9:
-                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
+                if _debug:
+                    sys.stderr.write("date handler function must return 9-tuple\n")
                 raise ValueError
-            map(int, date9tuple)
+            list(map(int, date9tuple))  # force evaluation so non-numeric values raise here
             return date9tuple
         except Exception as e:
-            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
-            pass
+            if _debug:
+                sys.stderr.write(f"{handler.__name__} raised {e!r}\n")
     return None
+

 def _getCharacterEncoding(http_headers, xml_data):
-    '''Get the character encoding of the XML document
+    """Get the character encoding of the XML document.

     http_headers is a dictionary
     xml_data is a raw string (not Unicode)
-    
+
     This is so much trickier than it sounds, it's not even funny.
     According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
     is application/xml, application/*+xml,
@@ -2363,12 +2956,12 @@ def _getCharacterEncoding(http_headers, xml_data):
     served with a Content-Type of text/* and no charset parameter
     must be treated as us-ascii.  (We now do this.)  And also that
     it must always be flagged as non-well-formed.  (We now do this too.)
-    
+
     If Content-Type is unspecified (input was local file or non-HTTP source)
     or unrecognized (server just got it totally wrong), then go by the
     encoding given in the XML prefix of the document and default to
     'iso-8859-1' as per the HTTP specification (RFC 2616).
-    
+
     Then, assuming we didn't find a character encoding in the HTTP headers
     (and the HTTP Content-type allowed us to look in the body), we need
     to sniff the first few bytes of the XML data and try to determine
@@ -2385,188 +2978,209 @@ def _getCharacterEncoding(http_headers, xml_data):
     correctly, which many are not).
CJKCodecs and iconv_codec help a lot; you should definitely install them if you can. http://cjkpython.i18n.org/ - ''' + """ def _parseHTTPContentType(content_type): - '''takes HTTP Content-Type header and returns (content type, charset) + """Takes HTTP Content-Type header and returns (content type, charset). If no charset is specified, returns (content type, '') If no content type is specified, returns ('', '') Both return parameters are guaranteed to be lowercase strings - ''' - content_type = content_type or '' + """ + content_type = content_type or "" content_type, params = cgi.parse_header(content_type) - return content_type, params.get('charset', '').replace("'", '') + return content_type, params.get("charset", "").replace("'", "") - sniffed_xml_encoding = '' - xml_encoding = '' - true_encoding = '' - http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type')) + sniffed_xml_encoding = "" + xml_encoding = "" + true_encoding = "" + http_content_type, http_encoding = _parseHTTPContentType(http_headers.get("content-type")) # Must sniff for non-ASCII-compatible character encodings before # searching for XML declaration. This heuristic is defined in # section F of the XML specification: # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info try: - if xml_data[:4] == '\x4c\x6f\xa7\x94': + if xml_data[:4] == "\x4c\x6f\xa7\x94": # EBCDIC xml_data = _ebcdic_to_ascii(xml_data) - elif xml_data[:4] == '\x00\x3c\x00\x3f': + elif xml_data[:4] == "\x00\x3c\x00\x3f": # UTF-16BE - sniffed_xml_encoding = 'utf-16be' - xml_data = str(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'): + sniffed_xml_encoding = "utf-16be" + xml_data = str(xml_data, "utf-16be").encode("utf-8") + elif (len(xml_data) >= 4) and (xml_data[:2] == "\xfe\xff") and (xml_data[2:4] != "\x00\x00"): # UTF-16BE with BOM - sniffed_xml_encoding = 'utf-16be' - xml_data = str(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x3f\x00': + sniffed_xml_encoding = "utf-16be" + xml_data = str(xml_data[2:], "utf-16be").encode("utf-8") + elif xml_data[:4] == "\x3c\x00\x3f\x00": # UTF-16LE - sniffed_xml_encoding = 'utf-16le' - xml_data = str(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'): + sniffed_xml_encoding = "utf-16le" + xml_data = str(xml_data, "utf-16le").encode("utf-8") + elif (len(xml_data) >= 4) and (xml_data[:2] == "\xff\xfe") and (xml_data[2:4] != "\x00\x00"): # UTF-16LE with BOM - sniffed_xml_encoding = 'utf-16le' - xml_data = str(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\x00\x3c': + sniffed_xml_encoding = "utf-16le" + xml_data = str(xml_data[2:], "utf-16le").encode("utf-8") + elif xml_data[:4] == "\x00\x00\x00\x3c": # UTF-32BE - sniffed_xml_encoding = 'utf-32be' - xml_data = str(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x00\x00': + sniffed_xml_encoding = "utf-32be" + xml_data = str(xml_data, "utf-32be").encode("utf-8") + elif xml_data[:4] == "\x3c\x00\x00\x00": # UTF-32LE - sniffed_xml_encoding = 'utf-32le' - xml_data = str(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\xfe\xff': + sniffed_xml_encoding = "utf-32le" + xml_data = str(xml_data, "utf-32le").encode("utf-8") + elif xml_data[:4] == "\x00\x00\xfe\xff": # UTF-32BE with BOM - sniffed_xml_encoding = 'utf-32be' - xml_data = str(xml_data[4:], 'utf-32be').encode('utf-8') - elif 
xml_data[:4] == '\xff\xfe\x00\x00':
+            sniffed_xml_encoding = "utf-32be"
+            xml_data = str(xml_data[4:], "utf-32be").encode("utf-8")
+        elif xml_data[:4] == "\xff\xfe\x00\x00":
             # UTF-32LE with BOM
-            sniffed_xml_encoding = 'utf-32le'
-            xml_data = str(xml_data[4:], 'utf-32le').encode('utf-8')
-        elif xml_data[:3] == '\xef\xbb\xbf':
+            sniffed_xml_encoding = "utf-32le"
+            xml_data = str(xml_data[4:], "utf-32le").encode("utf-8")
+        elif xml_data[:3] == "\xef\xbb\xbf":
             # UTF-8 with BOM
-            sniffed_xml_encoding = 'utf-8'
-            xml_data = str(xml_data[3:], 'utf-8').encode('utf-8')
+            sniffed_xml_encoding = "utf-8"
+            xml_data = str(xml_data[3:], "utf-8").encode("utf-8")
         else:
             # ASCII-compatible
             pass
-        xml_encoding_match = re.compile('^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>').match(xml_data)
+        xml_encoding_match = re.compile("^<\\?.*encoding=['\"](.*?)['\"].*\\?>").match(xml_data)
     except:
         xml_encoding_match = None
     if xml_encoding_match:
         xml_encoding = xml_encoding_match.groups()[0].lower()
-        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
+        if sniffed_xml_encoding and (
+            xml_encoding
+            in (
+                "iso-10646-ucs-2",
+                "ucs-2",
+                "csunicode",
+                "iso-10646-ucs-4",
+                "ucs-4",
+                "csucs4",
+                "utf-16",
+                "utf-32",
+                "utf_16",
+                "utf_32",
+                "utf16",
+                "u16",
+            )
+        ):
             xml_encoding = sniffed_xml_encoding
     acceptable_content_type = 0
-    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
-    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
-    if (http_content_type in application_content_types) or \
-            (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
+    application_content_types = ("application/xml", "application/xml-dtd", "application/xml-external-parsed-entity")
+    text_content_types = ("text/xml", "text/xml-external-parsed-entity")
+    if (http_content_type in application_content_types) or (
+        http_content_type.startswith("application/") and http_content_type.endswith("+xml")
+    ):
         acceptable_content_type = 1
-        true_encoding = http_encoding or xml_encoding or 'utf-8'
-    elif (http_content_type in text_content_types) or \
-            (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
+        true_encoding = http_encoding or xml_encoding or "utf-8"
+    elif (
+        (http_content_type in text_content_types)
+        or (http_content_type.startswith("text/"))
+        and http_content_type.endswith("+xml")
+    ):
         acceptable_content_type = 1
-        true_encoding = http_encoding or 'us-ascii'
-    elif http_content_type.startswith('text/'):
-        true_encoding = http_encoding or 'us-ascii'
-    elif http_headers and (not http_headers.has_key('content-type')):
-        true_encoding = xml_encoding or 'iso-8859-1'
+        true_encoding = http_encoding or "us-ascii"
+    elif http_content_type.startswith("text/"):
+        true_encoding = http_encoding or "us-ascii"
+    elif http_headers and ("content-type" not in http_headers):
+        true_encoding = xml_encoding or "iso-8859-1"
     else:
-        true_encoding = xml_encoding or 'utf-8'
+        true_encoding = xml_encoding or "utf-8"
     return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
+

 def _toUTF8(data, encoding):
-    '''Changes an XML data stream on the fly to specify a new encoding
+    """Changes an XML data stream on the fly to specify a new encoding.
data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
     encoding is a string recognized by encodings.aliases
-    '''
-    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
+    """
+    if _debug:
+        sys.stderr.write(f"entering _toUTF8, trying encoding {encoding}\n")
     # strip Byte Order Mark (if present)
-    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
+    if (len(data) >= 4) and (data[:2] == "\xfe\xff") and (data[2:4] != "\x00\x00"):
         if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-16be':
-                sys.stderr.write('trying utf-16be instead\n')
-        encoding = 'utf-16be'
+            sys.stderr.write("stripping BOM\n")
+            if encoding != "utf-16be":
+                sys.stderr.write("trying utf-16be instead\n")
+        encoding = "utf-16be"
         data = data[2:]
-    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
+    elif (len(data) >= 4) and (data[:2] == "\xff\xfe") and (data[2:4] != "\x00\x00"):
         if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-16le':
-                sys.stderr.write('trying utf-16le instead\n')
-        encoding = 'utf-16le'
+            sys.stderr.write("stripping BOM\n")
+            if encoding != "utf-16le":
+                sys.stderr.write("trying utf-16le instead\n")
+        encoding = "utf-16le"
         data = data[2:]
-    elif data[:3] == '\xef\xbb\xbf':
+    elif data[:3] == "\xef\xbb\xbf":
         if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-8':
-                sys.stderr.write('trying utf-8 instead\n')
-        encoding = 'utf-8'
+            sys.stderr.write("stripping BOM\n")
+            if encoding != "utf-8":
+                sys.stderr.write("trying utf-8 instead\n")
+        encoding = "utf-8"
         data = data[3:]
-    elif data[:4] == '\x00\x00\xfe\xff':
+    elif data[:4] == "\x00\x00\xfe\xff":
         if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-32be':
-                sys.stderr.write('trying utf-32be instead\n')
-        encoding = 'utf-32be'
+            sys.stderr.write("stripping BOM\n")
+            if encoding != "utf-32be":
+                sys.stderr.write("trying utf-32be instead\n")
+        encoding = "utf-32be"
         data = data[4:]
-    elif data[:4] == '\xff\xfe\x00\x00':
+    elif data[:4] == "\xff\xfe\x00\x00":
         if _debug:
-            sys.stderr.write('stripping BOM\n')
-            if encoding != 'utf-32le':
-                sys.stderr.write('trying utf-32le instead\n')
-        encoding = 'utf-32le'
+            sys.stderr.write("stripping BOM\n")
+            if encoding != "utf-32le":
+                sys.stderr.write("trying utf-32le instead\n")
+        encoding = "utf-32le"
         data = data[4:]
     newdata = str(data, encoding)
-    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
-    declmatch = re.compile(r'^<\?xml[^>]*?>')
-    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
-    if declmatch.search(newdata):
-        newdata = declmatch.sub(newdecl, newdata)
-    else:
-        newdata = newdecl + '\n' + newdata
-    return newdata.encode('utf-8')
+    if _debug:
+        sys.stderr.write(f"successfully converted {encoding} data to unicode\n")
+    declmatch = re.compile(r"^<\?xml[^>]*?>")
+    newdecl = """<?xml version='1.0' encoding='utf-8'?>"""
+    newdata = declmatch.sub(newdecl, newdata) if declmatch.search(newdata) else newdecl + "\n" + newdata
+    return newdata.encode("utf-8")
+

 def _stripDoctype(data):
-    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
+    """Strips DOCTYPE from XML document, returns (rss_version, stripped_data).

     rss_version may be 'rss091n' or None
     stripped_data is the same XML document, minus the DOCTYPE
-    '''
-    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
-    data = entity_pattern.sub('', data)
-    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
+    """
+    entity_pattern = re.compile(r"<!ENTITY([^>]*?)>", re.MULTILINE)
+    data = entity_pattern.sub("", data)
+    doctype_pattern = re.compile(r"<!DOCTYPE([^>]*?)>", re.MULTILINE)
     doctype_results = doctype_pattern.findall(data)
-    doctype = doctype_results and doctype_results[0] or ''
-    if doctype.lower().count('netscape'):
-        version = 'rss091n'
-    else:
-        version = None
-    data = doctype_pattern.sub('', data)
+    doctype = doctype_results and doctype_results[0] or ""
+    version = "rss091n" if doctype.lower().count("netscape") else None
+    data = doctype_pattern.sub("", data)
     return version, data
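The Netscape test above is what routes legacy RSS 0.91 feeds onto the 'rss091n' path. A sketch of the round trip (illustrative input; assumes the DOCTYPE pattern reconstructed above):

    from planet import feedparser

    data = """<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN"
     "http://my.netscape.com/publish/formats/rss-0.91.dtd"><rss version="0.91"></rss>"""

    version, stripped = feedparser._stripDoctype(data)
    print(version)                  # rss091n
    print("DOCTYPE" in stripped)    # False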

-def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
-    '''Parse a feed from a URL, file, stream, or string'''
+
+def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=None):
+    """Parse a feed from a URL, file, stream, or string."""
+    if handlers is None:
+        handlers = []
     result = FeedParserDict()
-    result['feed'] = FeedParserDict()
-    result['entries'] = []
+    result["feed"] = FeedParserDict()
+    result["entries"] = []
     if _XML_AVAILABLE:
-        result['bozo'] = 0
-    if type(handlers) == types.InstanceType:
+        result["bozo"] = 0
+    if not isinstance(handlers, list):  # types.InstanceType no longer exists under Python 3
         handlers = [handlers]
     try:
         f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
         data = f.read()
     except Exception as e:
-        result['bozo'] = 1
-        result['bozo_exception'] = e
-        data = ''
+        result["bozo"] = 1
+        result["bozo_exception"] = e
+        data = ""
         f = None

     # if feed is gzip-compressed, decompress it
-    if f and data and hasattr(f, 'headers'):
-        if gzip and f.headers.get('content-encoding', '') == 'gzip':
+    if f and data and hasattr(f, "headers"):
+        if gzip and f.headers.get("content-encoding", "") == "gzip":
             try:
                 data = gzip.GzipFile(fileobj=_StringIO(data)).read()
             except Exception as e:
                 # Some feeds claim to be gzipped but they're not, so
                 # we get garbage.  Ideally, we should re-request the
                 # feed without the 'Accept-encoding: gzip' header,
                 # but we don't.
-                result['bozo'] = 1
-                result['bozo_exception'] = e
-                data = ''
-        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
+                result["bozo"] = 1
+                result["bozo_exception"] = e
+                data = ""
+        elif zlib and f.headers.get("content-encoding", "") == "deflate":
             try:
                 data = zlib.decompress(data, -zlib.MAX_WBITS)
             except Exception as e:
-                result['bozo'] = 1
-                result['bozo_exception'] = e
-                data = ''
+                result["bozo"] = 1
+                result["bozo_exception"] = e
+                data = ""

     # save HTTP headers
-    if hasattr(f, 'info'):
+    if hasattr(f, "info"):
         info = f.info()
-        result['etag'] = info.getheader('ETag')
-        last_modified = info.getheader('Last-Modified')
+        result["etag"] = info.get("ETag")  # email.message.Message has no getheader()
+        last_modified = info.get("Last-Modified")
         if last_modified:
-            result['modified'] = _parse_date(last_modified)
-    if hasattr(f, 'url'):
-        result['href'] = f.url
-        result['status'] = 200
-    if hasattr(f, 'status'):
-        result['status'] = f.status
-    if hasattr(f, 'headers'):
-        result['headers'] = f.headers.dict
-    if hasattr(f, 'close'):
+            result["modified"] = _parse_date(last_modified)
+    if hasattr(f, "url"):
+        result["href"] = f.url
+        result["status"] = 200
+    if hasattr(f, "status"):
+        result["status"] = f.status
+    if hasattr(f, "headers"):
+        result["headers"] = dict(f.headers)  # Message objects have no .dict attribute
+    if hasattr(f, "close"):
         f.close()

     # there are four encodings to keep track of:
@@ -2607,27 +3221,30 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
     # - xml_encoding is the encoding declared in the <?xml declaration
-#2.4 - 10/19/2002 - MAP - added support for mapping RSS 0.9 <link> to, er, something; changed
+# 2.4 - 10/19/2002 - MAP - added support for mapping RSS 0.9 <link> to, er, something; changed
 # project name
-#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
+# 2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
 # removed unnecessary urllib code -- urllib2 should always be available anyway;
 # return actual url, status, and full HTTP headers (as result['url'],
 # result['status'], and result['headers']) if parsing a remote feed over HTTP --
 # this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
 # added the latest namespace-of-the-week for RSS 2.0
-#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
+# 2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
 # User-Agent (otherwise urllib2 sends two, which confuses some servers)
-#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
+# 2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
 # inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
-#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
+# 2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
 # textInput, and also to return the character encoding (if specified)
-#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
+# 2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
 # nested divs within content (JohnD); fixed missing sys import (JohanS);
 # fixed regular expression to capture XML character encoding (Andrei);
 # added support for Atom 0.3-style links; fixed bug with textInput tracking;
@@ -2801,7 +3416,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
 # description, xhtml:body, content, content:encoded, title, subtitle,
 # summary, info, tagline, and copyright; added support for pingback and
 # trackback namespaces
-#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
+# 2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
 # namespaces, as opposed to 2.6 when I said I did but didn't really;
 # sanitize HTML markup within some elements; added mxTidy support (if
 # installed) to tidy HTML markup within some elements; fixed indentation
@@ -2810,66 +3425,66 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
 # 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
 # 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
 # and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
-#2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
+# 2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
 # leak not closing url opener (JohnD); added dc:publisher support (MarekK);
 # added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
-#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br> tags in
+# 2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br> tags in
 # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
 # fixed relative URI processing for guid (skadz); added ICBM support; added
 # base64 support
-#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
+# 2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
 # blogspot.com sites); added _debug variable
-#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
-#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
+# 2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
+# 3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
 # added several new supported namespaces; fixed bug tracking naked markup in
 # description; added support for enclosure; added support for source; re-added
 # support for cloud which got dropped somehow; added support for expirationDate
-#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
+# 3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
 # xml:base URI, one for documents that don't define one explicitly and one for
 # documents that define an outer and an inner xml:base that goes out of scope
 # before the end of the document
-#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
-#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
+# 3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
+# 3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
 # will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
 # added support for creativeCommons:license and cc:license; added support for
 # full Atom content model in title, tagline, info, copyright, summary; fixed bug
 # with gzip encoding (not always telling server we support it when we do)
-#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
+# 3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
 # (dictionary of 'name', 'url', 'email'); map author to author_detail if author
 # contains name + email address
-#3.0b8 - 1/28/2004 - MAP - added support for contributor
-#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
+# 3.0b8 - 1/28/2004 - MAP - added support for contributor
+# 3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
 # support for summary
-#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
+# 3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
 # xml.util.iso8601
-#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
+# 3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
 # dangerous markup; fiddled with decodeEntities (not right); liberalized
 # date parsing even further
-#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
+# 3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
 # added support to Atom 0.2 subtitle; added support for Atom content model
 # in copyright; better sanitizing of dangerous HTML elements with end tags
 # (script, frameset)
-#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
+# 3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
 # etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
-#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
+# 3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
 # Python 2.1
-#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
+# 3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
 # fixed bug capturing author and contributor URL; fixed bug resolving relative
 # links in author and contributor URL; fixed bug resolving relative links in
 # generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
 # namespace tests, and included them permanently in the test suite with his
 # permission; fixed namespace handling under Python 2.1
-#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
-#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
-#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
+# 3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
+# 3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
+# 3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
 # use libxml2 (if available)
-#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
+# 3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
 # name was in parentheses; removed ultra-problematic mxTidy support; patch to
 # workaround crash in PyXML/expat when encountering invalid entities
 # (MarkMoraes); support for textinput/textInput
-#3.0b20 - 4/7/2004 - MAP - added CDF support
-#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
-#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
+# 3.0b20 - 4/7/2004 - MAP - added CDF support
+# 3.0b21 - 4/14/2004 - MAP - added Hot RSS support
+# 3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
 # results dict; changed results dict to allow getting values with results.key
 # as well as results[key]; work around embedded illformed HTML with half
 # a DOCTYPE; work around malformed Content-Type header; if character encoding
@@ -2879,19 +3494,19 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer
 # from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
 # convert each value in results to Unicode (if possible), even if using
 # regex-based parsing
-#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
+# 3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
 # high-bit characters in attributes in embedded HTML in description (thanks
 # Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
 # FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
 # about a mapped key
-#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
+# 3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
 # results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
 # cause the same encoding to be tried twice (even if it failed the first time);
 # fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
 # better textinput and image tracking in illformed RSS 1.0 feeds
-#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
+# 3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
 # my blink tag tests
-#3.0fc3 - 6/18/2004 - MAP - fixed bug in
+# 3.0fc3 - 6/18/2004 - MAP - fixed bug in
_changeEncodingDeclaration that # failed to parse utf-16 encoded feeds; made source into a FeedParserDict; # duplicate admin:generatorAgent/@rdf:resource in generator_detail.url; # added support for image; refactored parse() fallback logic to try other @@ -2900,14 +3515,14 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # we're properly tracking encoding in and out of BaseHTMLProcessor; set # feed.language from root-level xml:lang; set entry.id from rdf:about; # send Accept header -#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between +# 3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between # iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are # windows-1252); fixed regression that could cause the same encoding to be # tried twice (even if it failed the first time) -#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types; +# 3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types; # recover from malformed content-type header parameter with no equals sign # ('text/xml; charset:iso-8859-1') -#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities +# 3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities # to Unicode equivalents in illformed feeds (aaronsw); added and # passed tests for converting character entities to Unicode equivalents # in illformed feeds (aaronsw); test for valid parsers when setting @@ -2917,7 +3532,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # out of url and send as basic authentication; expose downloading-related # exceptions in bozo_exception (aaronsw); added __contains__ method to # FeedParserDict (aaronsw); added publisher_detail (aaronsw) -#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always +# 3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always # convert feed to UTF-8 before passing to XML parser; completely revamped # logic for determining character encoding and attempting XML parsing # (much faster); increased default timeout to 20 seconds; test for presence @@ -2928,7 +3543,7 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # XML parsers are available; added support for 'Content-encoding: deflate'; # send blank 'Accept-encoding: ' header if neither gzip nor zlib modules # are available -#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure +# 3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure # problem tracking xml:base and xml:lang if element declares it, child # doesn't, first grandchild redeclares it, and second grandchild doesn't; # refactored date parsing; defined public registerDateHandler so callers @@ -2948,11 +3563,11 @@ def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, refer # redirecting to a URL that returns 304, redirecting to a URL that # redirects to another URL with a different type of redirect); add # support for HTTP 303 redirects -#4.0 - MAP - support for relative URIs in xml:base attribute; fixed +# 4.0 - MAP - support for relative URIs in xml:base attribute; fixed # encoding issue with mxTidy (phopkins); preliminary support for RFC 3229; # support for Atom 1.0; support for iTunes extensions; new 'tags' for # categories/keywords/etc. 
as array of dict
 # {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
 # terminology; parse RFC 822-style dates with no time; lots of other
 # bug fixes
-#4.1 - MAP - removed socket timeout; added support for chardet library
+# 4.1 - MAP - removed socket timeout; added support for chardet library
diff --git a/code/planet/sanitize.py b/code/planet/sanitize.py
index 6ded170..8f20cdf 100644
--- a/code/planet/sanitize.py
+++ b/code/planet/sanitize.py
@@ -1,11 +1,9 @@
-"""
-sanitize: bringing sanitiy to world of messed-up data
+"""sanitize: bringing sanity to the world of messed-up data.

 TODO: py2->3
 """

-__author__ = ["Mark Pilgrim <f8dy@diveintomark.org>",
-    "Aaron Swartz <me@aaronsw.com>"]
+__author__ = ["Mark Pilgrim <f8dy@diveintomark.org>", "Aaron Swartz <me@aaronsw.com>"]
 __contributors__ = ["Sam Ruby <rubys@intertwingly.net>"]
 __license__ = "BSD"
 __version__ = "0.25"
@@ -30,41 +28,61 @@
 # Download from http://chardet.feedparser.org/
 try:
     import chardet
+
     if _debug:
         import chardet.constants
+
         chardet.constants._debug = 1
-    _chardet = lambda data: chardet.detect(data)['encoding']
+
+    def _chardet(data):
+        return chardet.detect(data)["encoding"]
 except:
     chardet = None
-    _chardet = lambda data: None
+
+    def _chardet(data) -> None:
+        return None
+

 class _BaseHTMLProcessor(HTMLParser):
-    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
-        'img', 'input', 'isindex', 'link', 'meta', 'param']
-
-    _r_barebang = re.compile(r'<!((?!DOCTYPE|--|\[))')
-    _r_bareamp = re.compile(r'&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)')
-    _r_shorttag = re.compile(r'<([^<\s]+?)\s*/>')
-
-    def __init__(self, encoding):
+    elements_no_end_tag = ["area", "base", "basefont", "br", "col", "frame", "hr",
+        "img", "input", "isindex", "link", "meta", "param"]
+
+    _r_barebang = re.compile(r"<!((?!DOCTYPE|--|\[))")
+    _r_bareamp = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")
+    _r_shorttag = re.compile(r"<([^<\s]+?)\s*/>")
+
+    def __init__(self, encoding) -> None:
         self.encoding = encoding
-        if _debug: sys.stderr.write(f'entering BaseHTMLProcessor, encoding={self.encoding}\n')
+        if _debug:
+            sys.stderr.write(f"entering BaseHTMLProcessor, encoding={self.encoding}\n")
         super().__init__(convert_charrefs=False)
-
-    def reset(self):
+
+    def reset(self) -> None:
         self.pieces = []
         super().reset()

     def _shorttag_replace(self, match):
         tag = match.group(1)
         if tag in self.elements_no_end_tag:
-            return '<' + tag + ' />'
+            return "<" + tag + " />"
         else:
-            return '<' + tag + '>'
-
-    def feed(self, data):
-        data = self._r_barebang.sub(r'&lt;!\1', data)
+            return "<" + tag + ">"
+
+    def feed(self, data) -> None:
+        data = self._r_barebang.sub(r"&lt;!\1", data)
         data = self._r_bareamp.sub("&amp;", data)
         data = self._r_shorttag.sub(self._shorttag_replace, data)
         if self.encoding and isinstance(data, str):
@@ -74,67 +92,69 @@ def feed(self, data):
     def normalize_attrs(self, attrs):
         # utility method to be called by descendants
         attrs = [(k.lower(), v) for k, v in attrs]
-        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
-        return attrs
+        return [(k, k in ("rel", "type") and v.lower() or v) for k, v in attrs]
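The three class-level patterns above drive feed()'s pre-processing, escaping stray markup that would otherwise confuse HTMLParser. A hedged sketch of their observable effect, assuming the patterns as reconstructed here:

    import re

    _r_barebang = re.compile(r"<!((?!DOCTYPE|--|\[))")
    _r_bareamp = re.compile(r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)")

    s = "AT&T says <!hello> &amp; goodbye"
    s = _r_barebang.sub(r"&lt;!\1", s)  # escape bare "<!" unless it opens DOCTYPE/comment/CDATA
    s = _r_bareamp.sub("&amp;", s)      # escape bare "&" but leave real entity references alone
    print(s)  # AT&amp;T says &lt;!hello> &amp; goodbye

The ampersand pass runs after the bang pass, so the "&lt;" it just introduced is recognized as an entity and left alone.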

-    def unknown_starttag(self, tag, attrs):
+    def unknown_starttag(self, tag, attrs) -> None:
         # called for each start tag
         # attrs is a list of (attr, value) tuples
         # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
+        if _debug:
+            sys.stderr.write(f"_BaseHTMLProcessor, unknown_starttag, tag={tag}\n")
         uattrs = []
         # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
         for key, value in attrs:
-            if type(value) != str:
+            if not isinstance(value, str):
                 value = str(value, self.encoding)
-            uattrs.append((str(key, self.encoding), value))
+            if not isinstance(key, str):
+                key = str(key, self.encoding)
+            uattrs.append((key, value))
-        strattrs = ''.join([f' {key}="{value}"' for key, value in uattrs]).encode(self.encoding)
+        strattrs = "".join([f' {key}="{value}"' for key, value in uattrs]).encode(self.encoding)
         if tag in self.elements_no_end_tag:
-            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
+            self.pieces.append("<{tag}{strattrs} />".format(**locals()))
         else:
-            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())
+            self.pieces.append("<{tag}{strattrs}>".format(**locals()))
 
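Since unknown_starttag and _shorttag_replace cooperate through elements_no_end_tag, here is a tiny sketch of the short-tag expansion (assumes the class attributes reconstructed above and the planet.sanitize import path used by this repo):

    from planet.sanitize import _BaseHTMLProcessor

    p = _BaseHTMLProcessor("utf-8")
    out = p._r_shorttag.sub(p._shorttag_replace, "a<br/>b<p/>c")
    print(out)  # a<br />b<p>c -- void elements keep the XML form, others lose the self-closing slash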
-    def unknown_endtag(self, tag):
+    def unknown_endtag(self, tag) -> None:
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
         # Reconstruct the original end tag.
         if tag not in self.elements_no_end_tag:
-            self.pieces.append("</%(tag)s>" % locals())
+            self.pieces.append("</{tag}>".format(**locals()))

-    def handle_charref(self, ref):
+    def handle_charref(self, ref) -> None:
         # called for each character reference, e.g. for '&#160;', ref will be '160'
         # Reconstruct the original character reference.
-        self.pieces.append('&#%(ref)s;' % locals())
-
-    def handle_entityref(self, ref):
+        self.pieces.append("&#{ref};".format(**locals()))
+
+    def handle_entityref(self, ref) -> None:
         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
         # Reconstruct the original entity reference.
-        self.pieces.append('&%(ref)s;' % locals())
+        self.pieces.append("&{ref};".format(**locals()))

-    def handle_data(self, text):
+    def handle_data(self, text) -> None:
         # called for each block of plain text, i.e. outside of any tag and
         # not containing any character or entity references
         # Store the original text verbatim.
-        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
+        if _debug:
+            sys.stderr.write(f"_BaseHTMLProcessor, handle_text, text={text}\n")
         self.pieces.append(text)
-
-    def handle_comment(self, text):
+
+    def handle_comment(self, text) -> None:
         # called for each HTML comment, e.g. <!-- insert Python code here -->
         # Reconstruct the original comment.
-        self.pieces.append('<!--%(text)s-->' % locals())
-
-    def handle_pi(self, text):
+        self.pieces.append("<!--{text}-->".format(**locals()))
+
+    def handle_pi(self, text) -> None:
         # called for each processing instruction, e.g. <?instruction>
         # Reconstruct original processing instruction.
-        self.pieces.append('<?%(text)s>' % locals())
+        self.pieces.append("<?{text}>".format(**locals()))

-    def handle_decl(self, text):
+    def handle_decl(self, text) -> None:
         # called for the DOCTYPE, if present, e.g.
         # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
         #     "http://www.w3.org/TR/html4/loose.dtd">
         # Reconstruct original DOCTYPE
-        self.pieces.append('<!%(text)s>' % locals())
-
-    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
+        self.pieces.append("<!{text}>".format(**locals()))
+
+    _new_declname_match = re.compile(r"[a-zA-Z][-_.a-zA-Z0-9:]*\s*").match
+
     def _scan_name(self, i, declstartpos):
         rawdata = self.rawdata
         n = len(rawdata)
@@ -149,69 +169,198 @@ def _scan_name(self, i, declstartpos):
             return name.lower(), m.end()
         else:
             self.handle_data(rawdata)
-#            self.updatepos(declstartpos, i)
+            # self.updatepos(declstartpos, i)
             return None, -1

     def output(self):
-        '''Return processed HTML as a single string'''
-        return ''.join([str(p) for p in self.pieces])
+        """Return processed HTML as a single string."""
+        return "".join([str(p) for p in self.pieces])
+

 class _HTMLSanitizer(_BaseHTMLProcessor):
-    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
-        'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
-        'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
-        'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
-        'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
-        'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
-        'strong', 'sub', 'sup', 'table', 'textarea', 'tbody', 'td', 'tfoot', 'th',
-        'thead', 'tr', 'tt', 'u', 'ul', 'var']
-
-    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
-        'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
-        'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
-        'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
-        'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
-        'id',
'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method', - 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', - 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size', - 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type', - 'usemap', 'valign', 'value', 'vspace', 'width'] - - ignorable_elements = ['script', 'applet', 'style'] - - def reset(self): + acceptable_elements = [ + "a", + "abbr", + "acronym", + "address", + "area", + "b", + "big", + "blockquote", + "br", + "button", + "caption", + "center", + "cite", + "code", + "col", + "colgroup", + "dd", + "del", + "dfn", + "dir", + "div", + "dl", + "dt", + "em", + "fieldset", + "font", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "hr", + "i", + "img", + "input", + "ins", + "kbd", + "label", + "legend", + "li", + "map", + "menu", + "ol", + "optgroup", + "option", + "p", + "pre", + "q", + "s", + "samp", + "select", + "small", + "span", + "strike", + "strong", + "sub", + "sup", + "table", + "textarea", + "tbody", + "td", + "tfoot", + "th", + "thead", + "tr", + "tt", + "u", + "ul", + "var", + ] + + acceptable_attributes = [ + "abbr", + "accept", + "accept-charset", + "accesskey", + "action", + "align", + "alt", + "axis", + "border", + "cellpadding", + "cellspacing", + "char", + "charoff", + "charset", + "checked", + "cite", + "class", + "clear", + "cols", + "colspan", + "color", + "compact", + "coords", + "datetime", + "dir", + "disabled", + "enctype", + "for", + "frame", + "headers", + "height", + "href", + "hreflang", + "hspace", + "id", + "ismap", + "label", + "lang", + "longdesc", + "maxlength", + "media", + "method", + "multiple", + "name", + "nohref", + "noshade", + "nowrap", + "prompt", + "readonly", + "rel", + "rev", + "rows", + "rowspan", + "rules", + "scope", + "selected", + "shape", + "size", + "span", + "src", + "start", + "summary", + "tabindex", + "target", + "title", + "type", + "usemap", + "valign", + "value", + "vspace", + "width", + ] + + ignorable_elements = ["script", "applet", "style"] + + def reset(self) -> None: _BaseHTMLProcessor.reset(self) self.tag_stack = [] self.ignore_level = 0 - def feed(self, data): + def feed(self, data) -> None: _BaseHTMLProcessor.feed(self, data) while self.tag_stack: _BaseHTMLProcessor.unknown_endtag(self, self.tag_stack.pop()) - - def unknown_starttag(self, tag, attrs): + + def unknown_starttag(self, tag, attrs) -> None: if tag in self.ignorable_elements: self.ignore_level += 1 return - + if self.ignore_level: return - + if tag in self.acceptable_elements: attrs = self.normalize_attrs(attrs) attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes] if tag not in self.elements_no_end_tag: self.tag_stack.append(tag) _BaseHTMLProcessor.unknown_starttag(self, tag, attrs) - - def unknown_endtag(self, tag): + + def unknown_endtag(self, tag) -> None: if tag in self.ignorable_elements: self.ignore_level -= 1 return - + if self.ignore_level: return - + if tag in self.acceptable_elements and tag not in self.elements_no_end_tag: match = False while self.tag_stack: @@ -224,19 +373,20 @@ def unknown_endtag(self, tag): if match: _BaseHTMLProcessor.unknown_endtag(self, tag) - def handle_pi(self, text): + def handle_pi(self, text) -> None: pass - def handle_decl(self, text): + def handle_decl(self, text) -> None: pass - def handle_data(self, text): + def handle_data(self, text) -> None: if not self.ignore_level: - text = text.replace('<', '') + text = text.replace("<", "") 
_BaseHTMLProcessor.handle_data(self, text)
+

 # TODO(py2to3): we need to replace `mx` and `tidy` here
-def HTML(htmlSource, encoding='utf8'):
+def HTML(htmlSource, encoding="utf8"):
     p = _HTMLSanitizer(encoding)
     p.feed(htmlSource)
     data = p.output()
@@ -248,113 +398,365 @@ def HTML(htmlSource, encoding="utf8"):
             try:
                 if tidy_interface == "uTidy":
                     from tidy import parseString as _utidy
+
                     def _tidy(data, **kwargs):
                         return str(_utidy(data, **kwargs))
+
                     break
                 elif tidy_interface == "mxTidy":
                     from mx.Tidy import Tidy as _mxtidy
+
                     def _tidy(data, **kwargs):
                         nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                         return data
+
                     break
             except:
                 pass
         if _tidy:
             utf8 = type(data) == str
             if utf8:
-                data = data.encode('utf-8')
+                data = data.encode("utf-8")
             data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
             if utf8:
-                data = str(data, 'utf-8')
-            if data.count('<body'):
-                data = data.split('<body', 1)[1]
-                if data.count('>'):
-                    data = data.split('>', 1)[1]
-            if data.count('</body'):
-                data = data.split('</body')[0]
+                data = str(data, "utf-8")
+            if data.count("<body"):
+                data = data.split("<body", 1)[1]
+                if data.count(">"):
+                    data = data.split(">", 1)[1]
+            if data.count("</body"):
+                data = data.split("</body")[0]
     data = data.strip().replace("\r\n", "\n")
     return data

-def _startswithbom(text, bom):
+def _startswithbom(text, bom) -> bool:
     for i, c in enumerate(bom):
-        if c == '#':
-            if text[i] == '\x00':
-                return False
-        else:
-            if text[i] != c:
+        if c == "#":
+            if text[i] == "\x00":
                 return False
+        elif text[i] != c:
+            return False
     return True
+

 def _detectbom(text, bom_map=unicode_bom_map):
-    for bom, encoding in bom_map.iteritems():
+    for bom, encoding in bom_map.items():
         if _startswithbom(text, bom):
             return encoding
     return None
+

 def characters(text, isXML=False, guess=None):
-    """
-    Takes a string text of unknown encoding and tries to
+    """Takes a string text of unknown encoding and tries to
     provide a Unicode string for it.
     """
     _triedEncodings = []
+
     def tryEncoding(encoding):
         if encoding and encoding not in _triedEncodings:
-            if encoding == 'ebcdic':
+            if encoding == "ebcdic":
                 return _ebcdic_to_ascii(text)
             try:
                 return str(text, encoding)
             except UnicodeDecodeError:
                 pass
             _triedEncodings.append(encoding)
+        return None

     return (
-        tryEncoding(guess) or
-        tryEncoding(_detectbom(text)) or
-        isXML and tryEncoding(_detectbom(text, xml_bom_map)) or
-        tryEncoding(_chardet(text)) or
-        tryEncoding('utf8') or
-        tryEncoding('windows-1252') or
-        tryEncoding('iso-8859-1'))
+        tryEncoding(guess)
+        or tryEncoding(_detectbom(text))
+        or isXML
+        and tryEncoding(_detectbom(text, xml_bom_map))
+        or tryEncoding(_chardet(text))
+        or tryEncoding("utf8")
+        or tryEncoding("windows-1252")
+        or tryEncoding("iso-8859-1")
+    )
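characters() is the whole fallback story in one expression: the caller's guess, BOM detection, chardet if installed, then utf-8 and the usual 8-bit suspects. A sketch of the intended behavior (illustrative byte strings; assumes the module is importable as planet.sanitize and that the BOM tables, garbled in this hunk, are defined):

    from planet.sanitize import characters

    print(characters(b"caf\xc3\xa9"))            # valid utf-8 -> 'café'
    print(characters(b"caf\xe9"))                # invalid utf-8, falls through to windows-1252 -> 'café'
    print(characters(b"hello", guess="ascii"))   # caller's guess wins when it decodes cleanly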
diff --git a/config/sort-ini.py b/config/sort-ini.py
index ec040bd..9620053 100755
--- a/config/sort-ini.py
+++ b/config/sort-ini.py
@@ -1,46 +1,42 @@
 #!/usr/bin/env python3
 import sys
+
-import ConfigParser
+import configparser

-if len(sys.argv) > 1:
-    filename = sys.argv[1]
-else:
-    filename = 'config.ini'
-
+filename = sys.argv[1] if len(sys.argv) > 1 else "config.ini"
+
-oconfig = ConfigParser.RawConfigParser()
+oconfig = configparser.RawConfigParser()
 oconfig.read(filename)

 # This part will destroy the configuration if there's a crash while
 # writing the output.  We're in a GIT-controlled directory, so
 # I didn't care enough to fix this.
-with open(filename, 'wb') as fd:
+with open(filename, "w") as fd:  # text mode: we write str, not bytes
     # Copy of write() code that sorts output by section
     if oconfig._defaults:
-        fd.write("[%s]\n" % DEFAULTSECT)
-        for (key, value) in oconfig._defaults.items():
-            fd.write("{} = {}\n".format(key, str(value).replace('\n', '\n\t')))
+        fd.write(f"[{configparser.DEFAULTSECT}]\n")
+        for key, value in oconfig._defaults.items():
+            fd.write("{} = {}\n".format(key, str(value).replace("\n", "\n\t")))
         fd.write("\n")
-
+
     result = {}
     for section in sorted(oconfig._sections):
-        if section == 'Planet':
-            fd.write("[%s]\n" % section)
-        for (key, value) in oconfig._sections[section].items():
+        if section == "Planet":
+            fd.write(f"[{section}]\n")
+        for key, value in oconfig._sections[section].items():
             if key != "__name__":
-                if section == 'Planet':
-                    fd.write("%s = %s\n" %
-                             (key, str(value).replace('\n', '\n\t')))
+                if section == "Planet":
+                    fd.write("{} = {}\n".format(key, str(value).replace("\n", "\n\t")))
                 else:
-                    result[value.replace('"', '')] = section
-        if section == 'Planet':
+                    result[value.replace('"', "")] = section
+        if section == "Planet":
             fd.write("\n")
-
+
     for key, value in sorted(result.items()):
-        fd.write("[%s]\n" % value)
+        fd.write(f"[{value}]\n")
         name = key
         if "'" in key:
-            name = '"%s"' % key
-        fd.write("name = %s\n" % name)
+            name = f'"{key}"'
+        fd.write(f"name = {name}\n")
         fd.write("\n")
-