diff --git a/marker/processors/sectionheader.py b/marker/processors/sectionheader.py index 881bc6c3..1e30de8a 100644 --- a/marker/processors/sectionheader.py +++ b/marker/processors/sectionheader.py @@ -41,9 +41,12 @@ class SectionHeaderProcessor(BaseProcessor): height_tolerance = .99 def __call__(self, document: Document): - line_heights: Dict[int, List[float]] = {} + line_heights: Dict[int, float] = {} for page in document.pages: - for block in page.contained_blocks(document, self.block_types): + # Iterate children to grab all section headers + for block in page.children: + if block.block_type not in self.block_types: + continue if block.structure is not None: line_heights[block.id] = block.line_height(document) else: @@ -54,11 +57,11 @@ def __call__(self, document: Document): heading_ranges = self.bucket_headings(flat_line_heights) for page in document.pages: + # Iterate children to grab all section headers for block in page.children: if block.block_type not in self.block_types: continue - - block_height = line_heights[block.id] + block_height = line_heights.get(block.id, 0) if block_height > 0: for idx, (min_height, max_height) in enumerate(heading_ranges): if block_height >= min_height * self.height_tolerance: diff --git a/marker/schema/blocks/base.py b/marker/schema/blocks/base.py index 4c6b5df9..dcb6c9cf 100644 --- a/marker/schema/blocks/base.py +++ b/marker/schema/blocks/base.py @@ -220,7 +220,7 @@ def render(self, document: Document, parent_structure: Optional[List[str]], sect section_hierarchy=section_hierarchy ) - def line_height(self, document: Document): + def line_height(self, document: Document) -> float: lines = self.contained_blocks(document, (BlockTypes.Line,)) if len(lines) == 0: return 0 diff --git a/pyproject.toml b/pyproject.toml index 07f1682f..7e8b1071 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "1.2.3" +version = "1.2.4" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md"