Skip to content

Commit

Permalink
Merge pull request #483 from VikParuchuri/hotfix-1
Browse files (browse the repository at this point in the history)
Patch section header issue
  • Loading branch information
VikParuchuri authored Jan 14, 2025
2 parents 675f9c8 + be1cfdf commit d154d8d
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 6 deletions.
11 changes: 7 additions & 4 deletions marker/processors/sectionheader.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ class SectionHeaderProcessor(BaseProcessor):
height_tolerance = .99

def __call__(self, document: Document):
- line_heights: Dict[int, List[float]] = {}
+ line_heights: Dict[int, float] = {}
for page in document.pages:
- for block in page.contained_blocks(document, self.block_types):
+ # Iterate children to grab all section headers
+ for block in page.children:
+ if block.block_type not in self.block_types:
+ continue
if block.structure is not None:
line_heights[block.id] = block.line_height(document)
else:
Expand All @@ -54,11 +57,11 @@ def __call__(self, document: Document):
heading_ranges = self.bucket_headings(flat_line_heights)

for page in document.pages:
# Iterate children to grab all section headers
for block in page.children:
if block.block_type not in self.block_types:
continue

- block_height = line_heights[block.id]
+ block_height = line_heights.get(block.id, 0)
if block_height > 0:
for idx, (min_height, max_height) in enumerate(heading_ranges):
if block_height >= min_height * self.height_tolerance:
Expand Down
2 changes: 1 addition & 1 deletion marker/schema/blocks/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def render(self, document: Document, parent_structure: Optional[List[str]], sect
section_hierarchy=section_hierarchy
)

- def line_height(self, document: Document):
+ def line_height(self, document: Document) -> float:
lines = self.contained_blocks(document, (BlockTypes.Line,))
if len(lines) == 0:
return 0
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
- version = "1.2.3"
+ version = "1.2.4"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit d154d8d

Please sign in to comment.