From edb877f4bc8b56c0cee1717e609b119cd48f2a7d Mon Sep 17 00:00:00 2001 From: pablonyx Date: Sat, 21 Dec 2024 15:30:25 -0800 Subject: [PATCH] fix NUL character (#3540) --- backend/onyx/indexing/indexing_pipeline.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/backend/onyx/indexing/indexing_pipeline.py b/backend/onyx/indexing/indexing_pipeline.py index da328dc18a3..f9ed3eb7b8e 100644 --- a/backend/onyx/indexing/indexing_pipeline.py +++ b/backend/onyx/indexing/indexing_pipeline.py @@ -260,6 +260,21 @@ def index_doc_batch_prepare( def filter_documents(document_batch: list[Document]) -> list[Document]: documents: list[Document] = [] for document in document_batch: + # Remove any NUL characters from title/semantic_id + # This is a known issue with the Zendesk connector + # Postgres cannot handle NUL characters in text fields + if document.title: + document.title = document.title.replace("\x00", "") + if document.semantic_identifier: + document.semantic_identifier = document.semantic_identifier.replace( + "\x00", "" + ) + + # Remove NUL characters from all sections + for section in document.sections: + if section.text is not None: + section.text = section.text.replace("\x00", "") + empty_contents = not any(section.text.strip() for section in document.sections) if ( (not document.title or not document.title.strip())