Skip to content

Commit

Permalink
Added support for page within a page in Confluence
Browse files: browse the repository at this point in the history
  • Loading branch information
hagen-danswer committed Nov 13, 2024
1 parent 6066042 commit daaf2ce
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 5 deletions.
33 changes: 33 additions & 0 deletions backend/danswer/connectors/confluence/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,39 @@ def extract_text_from_confluence_html(
continue
# Include @ sign for tagging, more clear for LLM
user.replaceWith("@" + _get_user(confluence_client, user_id))

for html_page_reference in soup.findAll("ri:page"):
# Wrap this in a try-except because there are some pages that might not exist
try:
page_title = html_page_reference.attrs["ri:content-title"]
if not page_title:
continue

page_query = f"type=page and title='{page_title}'"

page_contents: dict[str, Any] | None = None
# Confluence enforces title uniqueness, so we should only get one result here
for page_batch in confluence_client.paginated_cql_page_retrieval(
cql=page_query,
expand="body.storage.value",
limit=1,
):
page_contents = page_batch[0]
break
except Exception:
logger.warning(
f"Error getting page contents for object {confluence_object}"
)
continue

if not page_contents:
continue
text_from_page = extract_text_from_confluence_html(
confluence_client, page_contents
)

html_page_reference.replaceWith(text_from_page)

return format_document_soup(soup)


Expand Down
25 changes: 20 additions & 5 deletions backend/tests/daily/connectors/confluence/test_confluence_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,24 +39,39 @@ def test_confluence_connector_basic(
with pytest.raises(StopIteration):
next(doc_batch_generator)

assert len(doc_batch) == 2
assert len(doc_batch) == 3

for doc in doc_batch:
if doc.semantic_identifier == "DailyConnectorTestSpace Home":
page_doc = doc
elif ".txt" in doc.semantic_identifier:
txt_doc = doc
elif doc.semantic_identifier == "Page Within A Page":
page_within_a_page_doc = doc

assert page_within_a_page_doc.semantic_identifier == "Page Within A Page"
assert page_within_a_page_doc.primary_owners
assert page_within_a_page_doc.primary_owners[0].email == "hagen@danswer.ai"
assert len(page_within_a_page_doc.sections) == 1

page_within_a_page_section = page_within_a_page_doc.sections[0]
page_within_a_page_text = "@Chris Weaver loves cherry pie"
assert page_within_a_page_section.text == page_within_a_page_text
assert (
page_within_a_page_section.link
== "https://danswerai.atlassian.net/wiki/spaces/DailyConne/pages/200769540/Page+Within+A+Page"
)

assert page_doc.semantic_identifier == "DailyConnectorTestSpace Home"
assert page_doc.metadata["labels"] == ["testlabel"]
assert page_doc.primary_owners
assert page_doc.primary_owners[0].email == "chris@danswer.ai"
assert page_doc.primary_owners[0].email == "hagen@danswer.ai"
assert len(page_doc.sections) == 1

section = page_doc.sections[0]
assert section.text == "test123"
page_section = page_doc.sections[0]
assert page_section.text == "test123 " + page_within_a_page_text
assert (
section.link
page_section.link
== "https://danswerai.atlassian.net/wiki/spaces/DailyConne/overview"
)

Expand Down

0 comments on commit daaf2ce

Please sign in to comment.