Skip to content

Commit

Permalink
Uses faster approach to detect code
Browse files Browse the repository at this point in the history
  • Loading branch information
steineggerroland committed Jan 21, 2025
1 parent 8309d65 commit 3e4fb16
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion trafilatura/htmlprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from copy import deepcopy
from typing import List, Optional, Tuple

import regex
from courlan.urlutils import fix_relative_urls, get_base_url
from lxml.etree import _Element, Element, SubElement, XPath, strip_tags, tostring
from lxml.html import HtmlElement
Expand Down Expand Up @@ -317,10 +318,17 @@ def convert_quotes(elem: _Element) -> None:
code_flag = True
for subelem in code_elems:
subelem.attrib.clear()
if elem.text and any([code_indicator in elem.text for code_indicator in CODE_INDICATORS]):
if _is_code_block_forloop(elem.text):
code_flag = True
elem.tag = "code" if code_flag else "quote"

def _is_code_block_forloop(text: str) -> bool:
    "Report whether the text contains at least one entry of CODE_INDICATORS."
    found = False
    # Falsy text (empty string, or None when an element has no text) never matches.
    if text:
        # Explicit loop: stop scanning at the first indicator found in the text.
        for marker in CODE_INDICATORS:
            if marker in text:
                found = True
                break
    return found

def convert_headings(elem: _Element) -> None:
"Add head tags and delete attributes."
Expand Down

0 comments on commit 3e4fb16

Please sign in to comment.