diff --git a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py
index 381975219ed..f5ff08c1691 100644
--- a/backend/ee/onyx/external_permissions/google_drive/doc_sync.py
+++ b/backend/ee/onyx/external_permissions/google_drive/doc_sync.py
@@ -120,9 +120,12 @@ def _get_permissions_from_slim_doc(
         elif permission_type == "anyone":
             public = True
 
+    drive_id = permission_info.get("drive_id")
+    group_ids = group_emails | ({drive_id} if drive_id is not None else set())
+
     return ExternalAccess(
         external_user_emails=user_emails,
-        external_user_group_ids=group_emails,
+        external_user_group_ids=group_ids,
         is_public=public,
     )
 
diff --git a/backend/ee/onyx/external_permissions/google_drive/group_sync.py b/backend/ee/onyx/external_permissions/google_drive/group_sync.py
index 4fc15da00d4..7d1a27dbe91 100644
--- a/backend/ee/onyx/external_permissions/google_drive/group_sync.py
+++ b/backend/ee/onyx/external_permissions/google_drive/group_sync.py
@@ -1,52 +1,170 @@
 from ee.onyx.db.external_perm import ExternalUserGroup
 from onyx.connectors.google_drive.connector import GoogleDriveConnector
 from onyx.connectors.google_utils.google_utils import execute_paginated_retrieval
+from onyx.connectors.google_utils.resources import AdminService
 from onyx.connectors.google_utils.resources import get_admin_service
+from onyx.connectors.google_utils.resources import get_drive_service
 from onyx.db.models import ConnectorCredentialPair
 from onyx.utils.logger import setup_logger
 
 logger = setup_logger()
 
 
-def gdrive_group_sync(
-    cc_pair: ConnectorCredentialPair,
-) -> list[ExternalUserGroup]:
-    google_drive_connector = GoogleDriveConnector(
-        **cc_pair.connector.connector_specific_config
-    )
-    google_drive_connector.load_credentials(cc_pair.credential.credential_json)
-    admin_service = get_admin_service(
-        google_drive_connector.creds, google_drive_connector.primary_admin_email
+def _get_drive_members(
+    google_drive_connector: GoogleDriveConnector,
+) -> dict[str, tuple[set[str], set[str]]]:
+    """
+    This builds a map of drive ids to their members (group and user emails).
+    E.g. {
+        "drive_id_1": ({"group_email_1"}, {"user_email_1", "user_email_2"}),
+        "drive_id_2": ({"group_email_3"}, {"user_email_3"}),
+    }
+    """
+    drive_ids = google_drive_connector.get_all_drive_ids()
+
+    drive_id_to_members_map: dict[str, tuple[set[str], set[str]]] = {}
+    drive_service = get_drive_service(
+        google_drive_connector.creds,
+        google_drive_connector.primary_admin_email,
     )
 
-    onyx_groups: list[ExternalUserGroup] = []
+    for drive_id in drive_ids:
+        group_emails: set[str] = set()
+        user_emails: set[str] = set()
+        for permission in execute_paginated_retrieval(
+            drive_service.permissions().list,
+            list_key="permissions",
+            fileId=drive_id,
+            fields="permissions(emailAddress, type)",
+            supportsAllDrives=True,
+        ):
+            if permission["type"] == "group":
+                group_emails.add(permission["emailAddress"])
+            elif permission["type"] == "user":
+                user_emails.add(permission["emailAddress"])
+        drive_id_to_members_map[drive_id] = (group_emails, user_emails)
+    return drive_id_to_members_map
+
+
+def _get_all_groups(
+    admin_service: AdminService,
+    google_domain: str,
+) -> set[str]:
+    """
+    This gets all the group emails.
+    """
+    group_emails: set[str] = set()
     for group in execute_paginated_retrieval(
         admin_service.groups().list,
         list_key="groups",
-        domain=google_drive_connector.google_domain,
+        domain=google_domain,
         fields="groups(email)",
     ):
-        # The id is the group email
-        group_email = group["email"]
+        group_emails.add(group["email"])
+    return group_emails
+
 
-        # Gather group member emails
-        group_member_emails: list[str] = []
+def _map_group_email_to_member_emails(
+    admin_service: AdminService,
+    group_emails: set[str],
+) -> dict[str, set[str]]:
+    """
+    This maps group emails to their member emails.
+    """
+    group_to_member_map: dict[str, set[str]] = {}
+    for group_email in group_emails:
+        group_member_emails: set[str] = set()
         for member in execute_paginated_retrieval(
             admin_service.members().list,
             list_key="members",
             groupKey=group_email,
             fields="members(email)",
         ):
-            group_member_emails.append(member["email"])
+            group_member_emails.add(member["email"])
 
-        if not group_member_emails:
-            continue
+        group_to_member_map[group_email] = group_member_emails
+    return group_to_member_map
+
 
+def _build_onyx_groups(
+    drive_id_to_members_map: dict[str, tuple[set[str], set[str]]],
+    group_email_to_member_emails_map: dict[str, set[str]],
+) -> list[ExternalUserGroup]:
+    """
+    This converts the drive and group membership maps into onyx groups.
+    Each drive becomes a group (id = drive id) holding its direct users plus
+    the members of every group on the drive; each group email becomes a group
+    holding its members. The input maps are left unmodified.
+    """
+    onyx_groups: list[ExternalUserGroup] = []
+
+    # Convert all drive member definitions to onyx groups
+    # This is because having drive level access means you have
+    # irrevocable access to all the files in the drive.
+    for drive_id, (group_emails, user_emails) in drive_id_to_members_map.items():
+        # Copy so the caller's per-drive user set is not mutated by update()
+        all_member_emails: set[str] = set(user_emails)
+        for group_email in group_emails:
+            # Drive groups come from the drive's permission list while the map
+            # comes from the domain group listing, so a group may be missing
+            if group_email not in group_email_to_member_emails_map:
+                logger.warning(
+                    f"Group {group_email} on drive {drive_id} has no known members"
+                )
+                continue
+            all_member_emails.update(group_email_to_member_emails_map[group_email])
+        onyx_groups.append(
+            ExternalUserGroup(
+                id=drive_id,
+                user_emails=list(all_member_emails),
+            )
+        )
+
+    # Convert all group member definitions to onyx groups
+    for group_email, member_emails in group_email_to_member_emails_map.items():
         onyx_groups.append(
             ExternalUserGroup(
                 id=group_email,
-                user_emails=list(group_member_emails),
+                user_emails=list(member_emails),
             )
         )
 
     return onyx_groups
+
+
+def gdrive_group_sync(
+    cc_pair: ConnectorCredentialPair,
+) -> list[ExternalUserGroup]:
+    """
+    This builds the onyx groups for a Google Drive cc_pair: one group per
+    drive id returned by the connector and one per group in its google_domain.
+    """
+    # Initialize connector and build credential/service objects
+    google_drive_connector = GoogleDriveConnector(
+        **cc_pair.connector.connector_specific_config
+    )
+    google_drive_connector.load_credentials(cc_pair.credential.credential_json)
+    admin_service = get_admin_service(
+        google_drive_connector.creds, google_drive_connector.primary_admin_email
+    )
+
+    # Get all drive members
+    drive_id_to_members_map = _get_drive_members(google_drive_connector)
+
+    # Get all group emails
+    all_group_emails = _get_all_groups(
+        admin_service, google_drive_connector.google_domain
+    )
+
+    # Map group emails to their members
+    group_email_to_member_emails_map = _map_group_email_to_member_emails(
+        admin_service, all_group_emails
+    )
+
+    # Convert the maps to onyx groups
+    onyx_groups = _build_onyx_groups(
+        drive_id_to_members_map=drive_id_to_members_map,
+        group_email_to_member_emails_map=group_email_to_member_emails_map,
+    )
+
+    return onyx_groups
diff --git a/backend/onyx/connectors/google_drive/connector.py b/backend/onyx/connectors/google_drive/connector.py
index 9089a551bcc..d16007f52ab 100644
--- a/backend/onyx/connectors/google_drive/connector.py
+++ b/backend/onyx/connectors/google_drive/connector.py
@@ -258,7 +258,7 @@ def _get_all_user_emails(self) -> list[str]:
                     user_emails.append(email)
         return user_emails
 
-    def _get_all_drive_ids(self) -> set[str]:
+    def get_all_drive_ids(self) -> set[str]:
         primary_drive_service = get_drive_service(
             creds=self.creds,
             user_email=self.primary_admin_email,
@@ -353,7 +353,7 @@ def _manage_service_account_retrieval(
     ) -> Iterator[GoogleDriveFileType]:
         all_org_emails: list[str] = self._get_all_user_emails()
 
-        all_drive_ids: set[str] = self._get_all_drive_ids()
+        all_drive_ids: set[str] = self.get_all_drive_ids()
 
         drive_ids_to_retrieve: set[str] = set()
         folder_ids_to_retrieve: set[str] = set()
@@ -437,7 +437,7 @@ def _manage_oauth_retrieval(
             # If all 3 are true, we already yielded from get_all_files_for_oauth
             return
 
-        all_drive_ids = self._get_all_drive_ids()
+        all_drive_ids = self.get_all_drive_ids()
         drive_ids_to_retrieve: set[str] = set()
         folder_ids_to_retrieve: set[str] = set()
         if self._requested_shared_drive_ids or self._requested_folder_ids:
diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py
index 440e576e0d3..fc89654a43f 100644
--- a/backend/onyx/connectors/google_drive/doc_conversion.py
+++ b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -252,6 +252,7 @@ def build_slim_document(file: GoogleDriveFileType) -> SlimDocument | None:
         id=file["webViewLink"],
         perm_sync_data={
             "doc_id": file.get("id"),
+            "drive_id": file.get("driveId"),
             "permissions": file.get("permissions", []),
             "permission_ids": file.get("permissionIds", []),
             "name": file.get("name"),
diff --git a/backend/onyx/connectors/google_drive/file_retrieval.py b/backend/onyx/connectors/google_drive/file_retrieval.py
index da5a4bf8d07..4e459bd3bde 100644
--- a/backend/onyx/connectors/google_drive/file_retrieval.py
+++ b/backend/onyx/connectors/google_drive/file_retrieval.py
@@ -19,7 +19,7 @@
     "shortcutDetails, owners(emailAddress), size)"
 )
 SLIM_FILE_FIELDS = (
-    "nextPageToken, files(mimeType, id, name, permissions(emailAddress, type), "
+    "nextPageToken, files(mimeType, driveId, id, name, permissions(emailAddress, type), "
     "permissionIds, webViewLink, owners(emailAddress))"
 )
 FOLDER_FIELDS = "nextPageToken, files(id, name, permissions, modifiedTime, webViewLink, shortcutDetails)"