From 8b3c4eb195388422f36db66a546168bfa5b4330b Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Mon, 13 Jan 2025 13:40:11 -0800 Subject: [PATCH 1/4] fixed group sync to account for changes in drive permissions --- .../google_drive/doc_sync.py | 9 +- .../google_drive/group_sync.py | 121 +++++++++++++++--- .../onyx/connectors/google_drive/connector.py | 6 +- .../connectors/google_drive/doc_conversion.py | 1 + .../connectors/google_drive/file_retrieval.py | 2 +- 5 files changed, 113 insertions(+), 26 deletions(-) diff --git a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py index 381975219ed..eabeeb0a8d1 100644 --- a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py +++ b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py @@ -101,14 +101,14 @@ def _get_permissions_from_slim_doc( company_domain = google_drive_connector.google_domain user_emails: set[str] = set() - group_emails: set[str] = set() + group_names: set[str] = set() public = False for permission in permissions_list: permission_type = permission["type"] if permission_type == "user": user_emails.add(permission["emailAddress"]) elif permission_type == "group": - group_emails.add(permission["emailAddress"]) + group_names.add(permission["emailAddress"]) elif permission_type == "domain" and company_domain: if permission.get("domain") == company_domain: public = True @@ -120,9 +120,12 @@ def _get_permissions_from_slim_doc( elif permission_type == "anyone": public = True + if parent_drive_id := permission_info.get("drive_id"): + group_names.add(parent_drive_id) + return ExternalAccess( external_user_emails=user_emails, - external_user_group_ids=group_emails, + external_user_group_ids=group_names, is_public=public, ) diff --git a/backend/ee/onyx/external_permissions/google_drive/group_sync.py b/backend/ee/onyx/external_permissions/google_drive/group_sync.py index 4fc15da00d4..69f7f1ebc01 100644 --- a/backend/ee/onyx/external_permissions/google_drive/group_sync.py +++ b/backend/ee/onyx/external_permissions/google_drive/group_sync.py @@ -1,52 +1,135 @@ from ee.onyx.db.external_perm import ExternalUserGroup from onyx.connectors.google_drive.connector import GoogleDriveConnector from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval +from onyx.connectors.google_utils.resources import AdminService from onyx.connectors.google_utils.resources import get_admin_service +from onyx.connectors.google_utils.resources import get_drive_service from onyx.db.models import ConnectorCredentialPair from onyx.utils.logger import setup_logger logger = setup_logger() -def gdrive_group_sync( - cc_pair: ConnectorCredentialPair, -) -> list[ExternalUserGroup]: - google_drive_connector = GoogleDriveConnector( - **cc_pair.connector.connector_specific_config - ) - google_drive_connector.load_credentials(cc_pair.credential.credential_json) - admin_service = get_admin_service( - google_drive_connector.creds, google_drive_connector.primary_admin_email +def _get_drive_members( + google_drive_connector: GoogleDriveConnector, +) -> dict[str, tuple[set[str], set[str]]]: + drive_ids = google_drive_connector.get_all_drive_ids() + + drive_ids_to_members_map: dict[str, tuple[set[str], set[str]]] = {} + drive_service = get_drive_service( + google_drive_connector.creds, + google_drive_connector.primary_admin_email, ) - onyx_groups: list[ExternalUserGroup] = [] + for drive_id in drive_ids: + group_emails: set[str] = set() + user_emails: set[str] = set() + for permission in execute_paginated_retrieval( + drive_service.permissions().list, + list_key="permissions", + fileId=drive_id, + fields="permissions(emailAddress, type)", + supportsAllDrives=True, + ): + if permission["type"] == "group": + group_emails.add(permission["emailAddress"]) + elif permission["type"] == "user": + user_emails.add(permission["emailAddress"]) + drive_ids_to_members_map[drive_id] = (group_emails, user_emails) + return drive_ids_to_members_map + + +def _get_all_groups( + admin_service: AdminService, + google_domain: str, +) -> set[str]: + group_emails: set[str] = set() for group in execute_paginated_retrieval( admin_service.groups().list, list_key="groups", - domain=google_drive_connector.google_domain, + domain=google_domain, fields="groups(email)", ): - # The id is the group email - group_email = group["email"] + group_emails.add(group["email"]) + return group_emails - # Gather group member emails - group_member_emails: list[str] = [] + +def _map_group_to_members( + admin_service: AdminService, + group_emails: set[str], +) -> dict[str, set[str]]: + group_to_member_map: dict[str, set[str]] = {} + for group_email in group_emails: + group_member_emails: set[str] = set() for member in execute_paginated_retrieval( admin_service.members().list, list_key="members", groupKey=group_email, fields="members(email)", ): - group_member_emails.append(member["email"]) + group_member_emails.add(member["email"]) + + group_to_member_map[group_email] = group_member_emails + return group_to_member_map + - if not group_member_emails: - continue +def _build_onyx_groups( + drive_ids_to_members_map: dict[str, tuple[set[str], set[str]]], + group_to_members_map: dict[str, set[str]], +) -> list[ExternalUserGroup]: + onyx_groups: list[ExternalUserGroup] = [] + + # Convert all drive member definitions to onyx groups + for drive_id, (group_emails, user_emails) in drive_ids_to_members_map.items(): + all_member_emails: set[str] = user_emails + for group_email in group_emails: + all_member_emails.update(group_to_members_map[group_email]) + onyx_groups.append( + ExternalUserGroup( + id=drive_id, + user_emails=all_member_emails, + ) + ) + # Convert all group member definitions to onyx groups + for group_email, member_emails in group_to_members_map.items(): onyx_groups.append( ExternalUserGroup( id=group_email, - user_emails=list(group_member_emails), + user_emails=member_emails, ) ) return onyx_groups + + +def gdrive_group_sync( + cc_pair: ConnectorCredentialPair, +) -> list[ExternalUserGroup]: + # Initialize connector and build credential/service objects + google_drive_connector = GoogleDriveConnector( + **cc_pair.connector.connector_specific_config + ) + google_drive_connector.load_credentials(cc_pair.credential.credential_json) + admin_service = get_admin_service( + google_drive_connector.creds, google_drive_connector.primary_admin_email + ) + + # Get all drive members + drive_ids_to_members_map = _get_drive_members(google_drive_connector) + + # Get all group emails + all_group_emails = _get_all_groups( + admin_service, google_drive_connector.google_domain + ) + + # Map group emails to their members + group_to_members_map = _map_group_to_members(admin_service, all_group_emails) + + # Convert the maps to onyx groups + onyx_groups = _build_onyx_groups( + drive_ids_to_members_map=drive_ids_to_members_map, + group_to_members_map=group_to_members_map, + ) + + return onyx_groups diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py index 9089a551bcc..d16007f52ab 100644 --- a/backend/onyx/connectors/google_drive/connector.py +++ b/backend/onyx/connectors/google_drive/connector.py @@ -258,7 +258,7 @@ def _get_all_user_emails(self) -> list[str]: user_emails.append(email) return user_emails - def _get_all_drive_ids(self) -> set[str]: + def get_all_drive_ids(self) -> set[str]: primary_drive_service = get_drive_service( creds=self.creds, user_email=self.primary_admin_email, @@ -353,7 +353,7 @@ def _manage_service_account_retrieval( ) -> Iterator[GoogleDriveFileType]: all_org_emails: list[str] = self._get_all_user_emails() - all_drive_ids: set[str] = self._get_all_drive_ids() + all_drive_ids: set[str] = self.get_all_drive_ids() drive_ids_to_retrieve: set[str] = set() folder_ids_to_retrieve: set[str] = set() @@ -437,7 +437,7 @@ def _manage_oauth_retrieval( # If all 3 are true, we already yielded from get_all_files_for_oauth return - all_drive_ids = self._get_all_drive_ids() + all_drive_ids = self.get_all_drive_ids() drive_ids_to_retrieve: set[str] = set() folder_ids_to_retrieve: set[str] = set() if self._requested_shared_drive_ids or self._requested_folder_ids: diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py index 440e576e0d3..fc89654a43f 100644 --- a/backend/onyx/connectors/google_drive/doc_conversion.py +++ b/backend/onyx/connectors/google_drive/doc_conversion.py @@ -252,6 +252,7 @@ def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None: id=file["webViewLink"], perm_sync_data={ "doc_id": file.get("id"), + "drive_id": file.get("driveId"), "permissions": file.get("permissions", []), "permission_ids": file.get("permissionIds", []), "name": file.get("name"), diff --git a/backend/onyx/connectors/google_drive/file_retrieval.py b/backend/onyx/connectors/google_drive/file_retrieval.py index da5a4bf8d07..4e459bd3bde 100644 --- a/backend/onyx/connectors/google_drive/file_retrieval.py +++ b/backend/onyx/connectors/google_drive/file_retrieval.py @@ -19,7 +19,7 @@ "shortcutDetails, owners(emailAddress), size)" ) SLIM_FILE_FIELDS = ( - "nextPageToken, files(mimeType, id, name, permissions(emailAddress, type), " + "nextPageToken, files(mimeType, driveId, id, name, permissions(emailAddress, type), " "permissionIds, webViewLink, owners(emailAddress))" ) FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)" From fc239aca2dad441ee0792103e17c43f5be3f6f8b Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Wed, 15 Jan 2025 10:15:08 -0800 Subject: [PATCH 2/4] mypy --- .../ee/onyx/external_permissions/google_drive/group_sync.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/ee/onyx/external_permissions/google_drive/group_sync.py b/backend/ee/onyx/external_permissions/google_drive/group_sync.py index 69f7f1ebc01..2a24df0c173 100644 --- a/backend/ee/onyx/external_permissions/google_drive/group_sync.py +++ b/backend/ee/onyx/external_permissions/google_drive/group_sync.py @@ -87,7 +87,7 @@ def _build_onyx_groups( onyx_groups.append( ExternalUserGroup( id=drive_id, - user_emails=all_member_emails, + user_emails=list(all_member_emails), ) ) @@ -96,7 +96,7 @@ def _build_onyx_groups( onyx_groups.append( ExternalUserGroup( id=group_email, - user_emails=member_emails, + user_emails=list(member_emails), ) ) From 6cc61a07a3822a5a6c2d90f5e89944c4d827d5c7 Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Sat, 18 Jan 2025 15:08:42 -0800 Subject: [PATCH 3/4] addressed --- .../google_drive/group_sync.py | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/backend/ee/onyx/external_permissions/google_drive/group_sync.py b/backend/ee/onyx/external_permissions/google_drive/group_sync.py index 2a24df0c173..3fb4c76dab2 100644 --- a/backend/ee/onyx/external_permissions/google_drive/group_sync.py +++ b/backend/ee/onyx/external_permissions/google_drive/group_sync.py @@ -13,9 +13,16 @@ def _get_drive_members( google_drive_connector: GoogleDriveConnector, ) -> dict[str, tuple[set[str], set[str]]]: + """ + This builds a map of drive ids to their members (group and user emails). + E.g. { + "drive_id_1": ({"group_email_1", "group_email_2"}, {"user_email_1", "user_email_2"}), + "drive_id_2": ({"group_email_3"}, {"user_email_3"}), + } + """ drive_ids = google_drive_connector.get_all_drive_ids() - drive_ids_to_members_map: dict[str, tuple[set[str], set[str]]] = {} + drive_id_to_members_map: dict[str, tuple[set[str], set[str]]] = {} drive_service = get_drive_service( google_drive_connector.creds, google_drive_connector.primary_admin_email, @@ -35,14 +42,17 @@ def _get_drive_members( group_emails.add(permission["emailAddress"]) elif permission["type"] == "user": user_emails.add(permission["emailAddress"]) - drive_ids_to_members_map[drive_id] = (group_emails, user_emails) - return drive_ids_to_members_map + drive_id_to_members_map[drive_id] = (group_emails, user_emails) + return drive_id_to_members_map def _get_all_groups( admin_service: AdminService, google_domain: str, ) -> set[str]: + """ + This gets all the group emails. + """ group_emails: set[str] = set() for group in execute_paginated_retrieval( admin_service.groups().list, @@ -58,6 +68,9 @@ def _map_group_to_members( admin_service: AdminService, group_emails: set[str], ) -> dict[str, set[str]]: + """ + This maps group emails to their member emails. + """ group_to_member_map: dict[str, set[str]] = {} for group_email in group_emails: group_member_emails: set[str] = set() @@ -74,13 +87,15 @@ def _map_group_to_members( def _build_onyx_groups( - drive_ids_to_members_map: dict[str, tuple[set[str], set[str]]], + drive_id_to_members_map: dict[str, tuple[set[str], set[str]]], group_to_members_map: dict[str, set[str]], ) -> list[ExternalUserGroup]: onyx_groups: list[ExternalUserGroup] = [] # Convert all drive member definitions to onyx groups - for drive_id, (group_emails, user_emails) in drive_ids_to_members_map.items(): + # This is because having drive level access means you have + # irrevocable access to all the files in the drive. + for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items(): all_member_emails: set[str] = user_emails for group_email in group_emails: all_member_emails.update(group_to_members_map[group_email]) @@ -116,7 +131,7 @@ def gdrive_group_sync( ) # Get all drive members - drive_ids_to_members_map = _get_drive_members(google_drive_connector) + drive_id_to_members_map = _get_drive_members(google_drive_connector) # Get all group emails all_group_emails = _get_all_groups( @@ -128,7 +143,7 @@ def gdrive_group_sync( # Convert the maps to onyx groups onyx_groups = _build_onyx_groups( - drive_ids_to_members_map=drive_ids_to_members_map, + drive_id_to_members_map=drive_id_to_members_map, group_to_members_map=group_to_members_map, ) From e629772af4f9fe965a9ff34aa10bc151f3eba8c5 Mon Sep 17 00:00:00 2001 From: hagen-danswer Date: Sat, 18 Jan 2025 15:34:45 -0800 Subject: [PATCH 4/4] reeeeeeeee --- .../google_drive/doc_sync.py | 10 +++++----- .../google_drive/group_sync.py | 16 +++++++++------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py index eabeeb0a8d1..f5ff08c1691 100644 --- a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py +++ b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py @@ -101,14 +101,14 @@ def _get_permissions_from_slim_doc( company_domain = google_drive_connector.google_domain user_emails: set[str] = set() - group_names: set[str] = set() + group_emails: set[str] = set() public = False for permission in permissions_list: permission_type = permission["type"] if permission_type == "user": user_emails.add(permission["emailAddress"]) elif permission_type == "group": - group_names.add(permission["emailAddress"]) + group_emails.add(permission["emailAddress"]) elif permission_type == "domain" and company_domain: if permission.get("domain") == company_domain: public = True @@ -120,12 +120,12 @@ def _get_permissions_from_slim_doc( elif permission_type == "anyone": public = True - if parent_drive_id := permission_info.get("drive_id"): - group_names.add(parent_drive_id) + drive_id = permission_info.get("drive_id") + group_ids = group_emails | ({drive_id} if drive_id is not None else set()) return ExternalAccess( external_user_emails=user_emails, - external_user_group_ids=group_names, + external_user_group_ids=group_ids, is_public=public, ) diff --git a/backend/ee/onyx/external_permissions/google_drive/group_sync.py b/backend/ee/onyx/external_permissions/google_drive/group_sync.py index 3fb4c76dab2..7d1a27dbe91 100644 --- a/backend/ee/onyx/external_permissions/google_drive/group_sync.py +++ b/backend/ee/onyx/external_permissions/google_drive/group_sync.py @@ -16,7 +16,7 @@ def _get_drive_members( """ This builds a map of drive ids to their members (group and user emails). E.g. { - "drive_id_1": ({"group_email_1", "group_email_2"}, {"user_email_1", "user_email_2"}), + "drive_id_1": ({"group_email_1"}, {"user_email_1", "user_email_2"}), "drive_id_2": ({"group_email_3"}, {"user_email_3"}), } """ @@ -64,7 +64,7 @@ def _get_all_groups( return group_emails -def _map_group_to_members( +def _map_group_email_to_member_emails( admin_service: AdminService, group_emails: set[str], ) -> dict[str, set[str]]: @@ -88,7 +88,7 @@ def _map_group_to_members( def _build_onyx_groups( drive_id_to_members_map: dict[str, tuple[set[str], set[str]]], - group_to_members_map: dict[str, set[str]], + group_email_to_member_emails_map: dict[str, set[str]], ) -> list[ExternalUserGroup]: onyx_groups: list[ExternalUserGroup] = [] @@ -98,7 +98,7 @@ def _build_onyx_groups( for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items(): all_member_emails: set[str] = user_emails for group_email in group_emails: - all_member_emails.update(group_to_members_map[group_email]) + all_member_emails.update(group_email_to_member_emails_map[group_email]) onyx_groups.append( ExternalUserGroup( id=drive_id, @@ -107,7 +107,7 @@ def _build_onyx_groups( ) # Convert all group member definitions to onyx groups - for group_email, member_emails in group_to_members_map.items(): + for group_email, member_emails in group_email_to_member_emails_map.items(): onyx_groups.append( ExternalUserGroup( id=group_email, @@ -139,12 +139,14 @@ def gdrive_group_sync( ) # Map group emails to their members - group_to_members_map = _map_group_to_members(admin_service, all_group_emails) + group_email_to_member_emails_map = _map_group_email_to_member_emails( + admin_service, all_group_emails + ) # Convert the maps to onyx groups onyx_groups = _build_onyx_groups( drive_id_to_members_map=drive_id_to_members_map, - group_to_members_map=group_to_members_map, + group_email_to_member_emails_map=group_email_to_member_emails_map, ) return onyx_groups