From 264796e2cc6d6dcd69e730660eae6b12546869ab Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Mon, 20 Apr 2020 09:25:03 -0600 Subject: [PATCH 1/6] Add script for comparing layout of md files. This script finds all md files, associates all translations and analyzes the html tag structure and can detect when translations differ from the English original. Use --langs de,fr,... to restrict the operation to a specific subset of languages only. Use --include-regex to only scan for matching tags. Use --exclude-regex to remove tags that are often irrelevant (e.g. /p#). The above regexes are applied to the string representation of the html tag, which uses the following schema: /enclosing/tag/names#attr1=val1#attr2=val2#... To match a specific tag, use a /tagname# regex. --- _scripts/compare_md_structure.py | 112 +++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100755 _scripts/compare_md_structure.py diff --git a/_scripts/compare_md_structure.py b/_scripts/compare_md_structure.py new file mode 100755 index 000000000..77c8cf136 --- /dev/null +++ b/_scripts/compare_md_structure.py @@ -0,0 +1,112 @@ +#!/usr/bin/python3 +import io +import os +import re +import markdown +# from bs4 import BeautifulSoup +from lxml import etree +from collections import Counter +import difflib +import logging +import argparse + +logger = logging.getLogger(__name__) + +parser = argparse.ArgumentParser(description='Analyze element hierarchy of md files') +parser.add_argument( + '--langs', type=lambda s: s.split(','), + help='If specified, only analyze md files for given languages.') +parser.add_argument( + '--include-regex', + dest='include_regex', + help='If specified, only tag string representations that match this will be analyzed.') +parser.add_argument( + '--exclude-regex', + dest='exclude_regex', + help='If specificed, tag strings representations that do not match will be skipped.') +args = parser.parse_args() + +md_files = [] +for root, _, files in os.walk("."): +
md_files.extend(os.path.join(root, f) for f in files if f[-3:] == ".md") + +# Construct [basename][language] --> element tree +base_lang_trees = {} +md_filenames = {} # [basename][language] --> filenames +for f in md_files: + path, base = os.path.split(f) + lang_dirs = [x for x in path.split("/") if len(x) == 2] + if not lang_dirs: + continue + lang = lang_dirs[-1] + if lang != 'en' and args.langs and lang not in args.langs: + continue + with open(f, encoding='utf-8') as fd: + try: + tree = etree.HTML(markdown.markdown(fd.read())) + base_lang_trees.setdefault(base, {})[lang] = tree + md_filenames.setdefault(base, {})[lang] = f + except etree.XMLSyntaxError: + logger.warn(f'Failed to parse {f}') + + +def tag_sequences(tree): + """Creates string representation of element tree tags. + + Document is traversed in the document order and each element + is transformed into string representation: + + /path/to/tag#attr1=value1#attr2=value2#... + + Where /path/to/tag has html and body tags removed and + whitelist/blacklist regexes are applied. + + Resulting list of tag strings is returned. 
+ """ + tags = [] + for el in tree.iterdescendants(): + fp = [x.tag for x in el.iterancestors()] + [el.tag] + fp = [tag for tag in fp if tag not in ['html', 'body']] + path = '/'.join(fp) + attrs = '#'.join(f'{k}={v}' for k, v in sorted(el.items())) + rep = f'/{path}#{attrs}' + if not path: + continue + if args.include_regex and not re.search(args.include_regex, rep): + continue + if args.exclude_regex and re.search(args.exclude_regex, rep): + continue + tags.append(rep) + return tags + + +# Per language counters +total_files = Counter() +equal_files = Counter() + +for base, html_trees in base_lang_trees.items(): + if 'en' not in html_trees: + logger.warn(f"Do not have source file for {base}") + continue + + source_tags = tag_sequences(html_trees['en']) + for lang, tree in html_trees.items(): + #for t in tag_sequences(tree): + # print(t) + if lang == 'en': + continue + dest_tags = tag_sequences(tree) + total_files.update([lang]) + if source_tags == dest_tags: + equal_files.update([lang]) + else: + diff = difflib.unified_diff( + source_tags, dest_tags, + fromfile=md_filenames[base]['en'], + tofile=md_filenames[base][lang]) + print("\n".join(diff)) +for lang in sorted(total_files): + print( + f'[{lang}]: {equal_files[lang]}/{total_files[lang]} of analyzed pairs equal. ' + f'{total_files[lang] - equal_files[lang]} files differ. ' + f'({100.0 * equal_files[lang] / total_files[lang]:.2f}% equal)') From 385fcfc74b30314009177174f949af27788e41b7 Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Mon, 20 Apr 2020 09:36:43 -0600 Subject: [PATCH 2/6] Use non-deprecated logger.warning methods. 
--- _scripts/compare_md_structure.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_scripts/compare_md_structure.py b/_scripts/compare_md_structure.py index 77c8cf136..8e98e39f4 100755 --- a/_scripts/compare_md_structure.py +++ b/_scripts/compare_md_structure.py @@ -47,7 +47,7 @@ base_lang_trees.setdefault(base, {})[lang] = tree md_filenames.setdefault(base, {})[lang] = f except etree.XMLSyntaxError: - logger.warn(f'Failed to parse {f}') + logger.warning(f'Failed to parse {f}') def tag_sequences(tree): @@ -86,7 +86,7 @@ def tag_sequences(tree): for base, html_trees in base_lang_trees.items(): if 'en' not in html_trees: - logger.warn(f"Do not have source file for {base}") + logger.warning(f"Do not have source file for {base}") continue source_tags = tag_sequences(html_trees['en']) From 1c35975e09ff280764df1e46439849fcc548d11f Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Mon, 20 Apr 2020 09:38:00 -0600 Subject: [PATCH 3/6] Skip over md files in build/ directory --- _scripts/compare_md_structure.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/_scripts/compare_md_structure.py b/_scripts/compare_md_structure.py index 8e98e39f4..119bb296d 100755 --- a/_scripts/compare_md_structure.py +++ b/_scripts/compare_md_structure.py @@ -35,6 +35,8 @@ md_filenames = {} # [basename][language] --> filenames for f in md_files: path, base = os.path.split(f) + if '/build/' in path: + continue lang_dirs = [x for x in path.split("/") if len(x) == 2] if not lang_dirs: continue From 9ac1dd1352b583d7d6e81f41870cc7f6cb1f3780 Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Mon, 20 Apr 2020 09:46:09 -0600 Subject: [PATCH 4/6] A few minor stylistic changes. 1. sort imports 2. eliminate spurious newlines from diffs 3. return non-zero if diffs found 4.
only report on languages where diffs are found --- _scripts/compare_md_structure.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/_scripts/compare_md_structure.py b/_scripts/compare_md_structure.py index 119bb296d..573a4b9c0 100755 --- a/_scripts/compare_md_structure.py +++ b/_scripts/compare_md_structure.py @@ -1,14 +1,15 @@ #!/usr/bin/python3 +import argparse +import difflib import io +import logging +import markdown import os import re -import markdown -# from bs4 import BeautifulSoup -from lxml import etree +import sys + from collections import Counter -import difflib -import logging -import argparse +from lxml import etree logger = logging.getLogger(__name__) @@ -105,10 +106,20 @@ def tag_sequences(tree): diff = difflib.unified_diff( source_tags, dest_tags, fromfile=md_filenames[base]['en'], - tofile=md_filenames[base][lang]) + tofile=md_filenames[base][lang], + lineterm='') print("\n".join(diff)) + +found_diffs = False for lang in sorted(total_files): + if total_files[lang] == equal_files[lang]: + logger.info('[lang]: all {total_files[lang]} equal.') + continue + found_diffs = True print( f'[{lang}]: {equal_files[lang]}/{total_files[lang]} of analyzed pairs equal. ' f'{total_files[lang] - equal_files[lang]} files differ. ' f'({100.0 * equal_files[lang] / total_files[lang]:.2f}% equal)') + +if found_diffs: + sys.exit(1) From b54768725c8f69e2395cc5c5403b1633d436859a Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Mon, 20 Apr 2020 15:16:19 -0600 Subject: [PATCH 5/6] Few operational improvements to the script. 1. added --show-tag-summaries. Shows summaries of tags that diverge. 2. Exclude alt= attribute when building string representation. This is bound to always be different. 
--- _scripts/compare_md_structure.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/_scripts/compare_md_structure.py b/_scripts/compare_md_structure.py index 573a4b9c0..10622109b 100755 --- a/_scripts/compare_md_structure.py +++ b/_scripts/compare_md_structure.py @@ -25,6 +25,15 @@ '--exclude-regex', dest='exclude_regex', help='If specificed, tag strings representations that do not match will be skipped.') +parser.add_argument( + '--show-tag-summaries', + action='store_true', + dest='show_tag_summaries', + help='If given, print summaries of tag names where differences were found.') + +# TODO: different modes of operation should allow: +# 1. printing diffs from each file +# 2. summarizing which tags are divergent (for each language) args = parser.parse_args() md_files = [] @@ -71,7 +80,7 @@ def tag_sequences(tree): fp = [x.tag for x in el.iterancestors()] + [el.tag] fp = [tag for tag in fp if tag not in ['html', 'body']] path = '/'.join(fp) - attrs = '#'.join(f'{k}={v}' for k, v in sorted(el.items())) + attrs = '#'.join(f'{k}={v}' for k, v in sorted(el.items()) if k not in ['alt']) rep = f'/{path}#{attrs}' if not path: continue @@ -86,6 +95,7 @@ def tag_sequences(tree): # Per language counters total_files = Counter() equal_files = Counter() +tag_summaries = Counter() for base, html_trees in base_lang_trees.items(): if 'en' not in html_trees: @@ -103,6 +113,10 @@ def tag_sequences(tree): if source_tags == dest_tags: equal_files.update([lang]) else: + sym_diff = set(source_tags).symmetric_difference(set(dest_tags)) + diff_tags = [x.split('#')[0].split('/')[-1] for x in sym_diff] + tag_summaries.update(diff_tags) + diff = difflib.unified_diff( source_tags, dest_tags, fromfile=md_filenames[base]['en'], @@ -121,5 +135,10 @@ def tag_sequences(tree): f'{total_files[lang] - equal_files[lang]} files differ. 
' f'({100.0 * equal_files[lang] / total_files[lang]:.2f}% equal)') +if args.show_tag_summaries: + print('Frequency of tag discrepancies between english and translations.') + for tag, cnt in tag_summaries.most_common(): + print(f'{cnt}\t{tag}') + if found_diffs: sys.exit(1) From 3c9d259070add5748f02caf1535a088152eab3c6 Mon Sep 17 00:00:00 2001 From: Jan Rous Date: Mon, 20 Apr 2020 15:34:39 -0600 Subject: [PATCH 6/6] Add --hide-diffs option and remove outdated TODO. --- _scripts/compare_md_structure.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/_scripts/compare_md_structure.py b/_scripts/compare_md_structure.py index 10622109b..a3a3baa6c 100755 --- a/_scripts/compare_md_structure.py +++ b/_scripts/compare_md_structure.py @@ -30,10 +30,11 @@ action='store_true', dest='show_tag_summaries', help='If given, print summaries of tag names where differences were found.') - -# TODO: different modes of operation should allow: -# 1. printing diffs from each file -# 2. summarizing which tags are divergent (for each language) +parser.add_argument( + '--hide-diffs', + action='store_true', + dest='hide_diffs', + help='If specified, do not show diffs for each file.') args = parser.parse_args() md_files = [] @@ -122,7 +123,8 @@ def tag_sequences(tree): fromfile=md_filenames[base]['en'], tofile=md_filenames[base][lang], lineterm='') - print("\n".join(diff)) + if not args.hide_diffs: + print("\n".join(diff)) found_diffs = False for lang in sorted(total_files):