Skip to content

Commit

Permalink
k
Browse files Browse the repository at this point in the history
  • Loading branch information
yuhongsun96 committed Nov 22, 2024
1 parent 2b226be commit d9031ea
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions backend/danswer/indexing/chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,10 +221,14 @@ def _create_chunk(
mini_chunk_texts=self._get_mini_chunk_texts(text),
)

for section in document.sections:
for section_idx, section in enumerate(document.sections):
section_text = clean_text(section.text)
section_link_text = section.link or ""
if not section_text:
# If there is no useful content, not even the title, just drop it
if not section_text and (not document.title or section_idx > 0):
# If a section is empty, we can just drop it, unless it is the first section of a document that has a title.
# We return a list of DocAwareChunks, where each one contains the information needed down the line for indexing.
# There is no concern about dropping whole documents from this list; it should not cause any indexing failures.
logger.warning(
f"Skipping section {section.text} from document "
f"{document.semantic_identifier} due to empty text after cleaning "
Expand Down

0 comments on commit d9031ea

Please sign in to comment.